diff mbox

drm/i915: Flush outstanding unpin tasks before pageflipping

Message ID 1351761986-27982-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Nov. 1, 2012, 9:26 a.m. UTC
If we accumulate unpin tasks because we are pageflipping faster than the
system can schedule its workers, we can effectively create a
pin-leak. The solution taken here is to limit the number of unpin tasks
we have per-crtc and to flush those outstanding tasks if we accumulate
too many. This should prevent any jitter in the normal case, and also
prevent the hang if we should run too fast.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
 drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
 2 files changed, 19 insertions(+), 7 deletions(-)

Comments

Jesse Barnes Nov. 1, 2012, 3:07 p.m. UTC | #1
On Thu,  1 Nov 2012 09:26:26 +0000
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> If we accumulate unpin tasks because we are pageflipping faster than the
> system can schedule its workers, we can effectively create a
> pin-leak. The solution taken here is to limit the number of unpin tasks
> we have per-crtc and to flush those outstanding tasks if we accumulate
> too many. This should prevent any jitter in the normal case, and also
> prevent the hang if we should run too fast.
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
>  drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
>  2 files changed, 19 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 69b1739..800b195 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
>  {
>  	struct intel_unpin_work *work =
>  		container_of(__work, struct intel_unpin_work, work);
> +	struct drm_device *dev = work->crtc->dev;
>  
> -	mutex_lock(&work->dev->struct_mutex);
> +	mutex_lock(&dev->struct_mutex);
>  	intel_unpin_fb_obj(work->old_fb_obj);
>  	drm_gem_object_unreference(&work->pending_flip_obj->base);
>  	drm_gem_object_unreference(&work->old_fb_obj->base);
>  
> -	intel_update_fbc(work->dev);
> -	mutex_unlock(&work->dev->struct_mutex);
> +	intel_update_fbc(dev);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
> +	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
> +
>  	kfree(work);
>  }
>  
> @@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
>  
>  	atomic_clear_mask(1 << intel_crtc->plane,
>  			  &obj->pending_flip.counter);
> -
>  	wake_up(&dev_priv->pending_flip_queue);
> -	schedule_work(&work->work);
> +
> +	queue_work(dev_priv->wq, &work->work);
>  
>  	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
>  }
> @@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  		return -ENOMEM;
>  
>  	work->event = event;
> -	work->dev = crtc->dev;
> +	work->crtc = crtc;
>  	intel_fb = to_intel_framebuffer(crtc->fb);
>  	work->old_fb_obj = intel_fb->obj;
>  	INIT_WORK(&work->work, intel_unpin_work_fn);
> @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  	intel_fb = to_intel_framebuffer(fb);
>  	obj = intel_fb->obj;
>  
> +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> +		flush_workqueue(dev_priv->wq);
> +

Have you by chance tested this with the async flip patch?  I wonder if
in that case whether 2 is too small, and something like 100 might be
better (though really async flips are for cases where we can't keep up
with refresh, so a small number shouldn't hurt too much there either).
Chris Wilson Nov. 1, 2012, 3:18 p.m. UTC | #2
On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Thu,  1 Nov 2012 09:26:26 +0000
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > If we accumulate unpin tasks because we are pageflipping faster than the
> > system can schedule its workers, we can effectively create a
> > pin-leak. The solution taken here is to limit the number of unpin tasks
> > we have per-crtc and to flush those outstanding tasks if we accumulate
> > too many. This should prevent any jitter in the normal case, and also
> > prevent the hang if we should run too fast.
> > 
> > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> > Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
> >  drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
> >  2 files changed, 19 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> > index 69b1739..800b195 100644
> > --- a/drivers/gpu/drm/i915/intel_display.c
> > +++ b/drivers/gpu/drm/i915/intel_display.c
> > @@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
> >  {
> >  	struct intel_unpin_work *work =
> >  		container_of(__work, struct intel_unpin_work, work);
> > +	struct drm_device *dev = work->crtc->dev;
> >  
> > -	mutex_lock(&work->dev->struct_mutex);
> > +	mutex_lock(&dev->struct_mutex);
> >  	intel_unpin_fb_obj(work->old_fb_obj);
> >  	drm_gem_object_unreference(&work->pending_flip_obj->base);
> >  	drm_gem_object_unreference(&work->old_fb_obj->base);
> >  
> > -	intel_update_fbc(work->dev);
> > -	mutex_unlock(&work->dev->struct_mutex);
> > +	intel_update_fbc(dev);
> > +	mutex_unlock(&dev->struct_mutex);
> > +
> > +	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
> > +	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
> > +
> >  	kfree(work);
> >  }
> >  
> > @@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
> >  
> >  	atomic_clear_mask(1 << intel_crtc->plane,
> >  			  &obj->pending_flip.counter);
> > -
> >  	wake_up(&dev_priv->pending_flip_queue);
> > -	schedule_work(&work->work);
> > +
> > +	queue_work(dev_priv->wq, &work->work);
> >  
> >  	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
> >  }
> > @@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> >  		return -ENOMEM;
> >  
> >  	work->event = event;
> > -	work->dev = crtc->dev;
> > +	work->crtc = crtc;
> >  	intel_fb = to_intel_framebuffer(crtc->fb);
> >  	work->old_fb_obj = intel_fb->obj;
> >  	INIT_WORK(&work->work, intel_unpin_work_fn);
> > @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> >  	intel_fb = to_intel_framebuffer(fb);
> >  	obj = intel_fb->obj;
> >  
> > +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> > +		flush_workqueue(dev_priv->wq);
> > +
> 
> Have you by chance tested this with the async flip patch?  I wonder if
> in that case whether 2 is too small, and something like 100 might be
> better (though really async flips are for cases where we can't keep up
> with refresh, so a small number shouldn't hurt too much there either).

The limit on 2 is due to the limited resolution of pincount. Hence my
earlier fear for your async flip patch.
-Chris
Daniel Vetter Nov. 1, 2012, 3:29 p.m. UTC | #3
On Thu, Nov 01, 2012 at 03:18:46PM +0000, Chris Wilson wrote:
> On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > On Thu,  1 Nov 2012 09:26:26 +0000
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > 
> > > If we accumulate unpin tasks because we are pageflipping faster than the
> > > system can schedule its workers, we can effectively create a
> > > pin-leak. The solution taken here is to limit the number of unpin tasks
> > > we have per-crtc and to flush those outstanding tasks if we accumulate
> > > too many. This should prevent any jitter in the normal case, and also
> > > prevent the hang if we should run too fast.
> > > 
> > > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> > > Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > ---
> > >  drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
> > >  drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
> > >  2 files changed, 19 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> > > index 69b1739..800b195 100644
> > > --- a/drivers/gpu/drm/i915/intel_display.c
> > > +++ b/drivers/gpu/drm/i915/intel_display.c
> > > @@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
> > >  {
> > >  	struct intel_unpin_work *work =
> > >  		container_of(__work, struct intel_unpin_work, work);
> > > +	struct drm_device *dev = work->crtc->dev;
> > >  
> > > -	mutex_lock(&work->dev->struct_mutex);
> > > +	mutex_lock(&dev->struct_mutex);
> > >  	intel_unpin_fb_obj(work->old_fb_obj);
> > >  	drm_gem_object_unreference(&work->pending_flip_obj->base);
> > >  	drm_gem_object_unreference(&work->old_fb_obj->base);
> > >  
> > > -	intel_update_fbc(work->dev);
> > > -	mutex_unlock(&work->dev->struct_mutex);
> > > +	intel_update_fbc(dev);
> > > +	mutex_unlock(&dev->struct_mutex);
> > > +
> > > +	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
> > > +	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
> > > +
> > >  	kfree(work);
> > >  }
> > >  
> > > @@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
> > >  
> > >  	atomic_clear_mask(1 << intel_crtc->plane,
> > >  			  &obj->pending_flip.counter);
> > > -
> > >  	wake_up(&dev_priv->pending_flip_queue);
> > > -	schedule_work(&work->work);
> > > +
> > > +	queue_work(dev_priv->wq, &work->work);
> > >  
> > >  	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
> > >  }
> > > @@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > >  		return -ENOMEM;
> > >  
> > >  	work->event = event;
> > > -	work->dev = crtc->dev;
> > > +	work->crtc = crtc;
> > >  	intel_fb = to_intel_framebuffer(crtc->fb);
> > >  	work->old_fb_obj = intel_fb->obj;
> > >  	INIT_WORK(&work->work, intel_unpin_work_fn);
> > > @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > >  	intel_fb = to_intel_framebuffer(fb);
> > >  	obj = intel_fb->obj;
> > >  
> > > +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> > > +		flush_workqueue(dev_priv->wq);
> > > +
> > 
> > Have you by chance tested this with the async flip patch?  I wonder if
> > in that case whether 2 is too small, and something like 100 might be
> > better (though really async flips are for cases where we can't keep up
> > with refresh, so a small number shouldn't hurt too much there either).
> 
> The limit on 2 is due to the limited resolution of pincount. Hence my
> earlier fear for your async flip patch.

I think for async flips we simply need to have a real flip queue in our
code, instead of abusing the implicit list in the workqueue code ...

One other thing is that with async flips we don't have a natural limit on
the number of pinned framebuffers any more, which means we can easily
exhaust all mappable GTT space. Hence we need to integrate that new,
explicit flip queue into our eviction code, too.

For now I'm rather happy with the flush_wq ducttape presented here ;-)

Cheers, Daniel
Jesse Barnes Nov. 1, 2012, 3:34 p.m. UTC | #4
On Thu, 1 Nov 2012 16:29:35 +0100
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Thu, Nov 01, 2012 at 03:18:46PM +0000, Chris Wilson wrote:
> > On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > > On Thu,  1 Nov 2012 09:26:26 +0000
> > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > 
> > > > If we accumulate unpin tasks because we are pageflipping faster than the
> > > > system can schedule its workers, we can effectively create a
> > > > pin-leak. The solution taken here is to limit the number of unpin tasks
> > > > we have per-crtc and to flush those outstanding tasks if we accumulate
> > > > too many. This should prevent any jitter in the normal case, and also
> > > > prevent the hang if we should run too fast.
> > > > 
> > > > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> > > > Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > ---
> > > >  drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
> > > >  drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
> > > >  2 files changed, 19 insertions(+), 7 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> > > > index 69b1739..800b195 100644
> > > > --- a/drivers/gpu/drm/i915/intel_display.c
> > > > +++ b/drivers/gpu/drm/i915/intel_display.c
> > > > @@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
> > > >  {
> > > >  	struct intel_unpin_work *work =
> > > >  		container_of(__work, struct intel_unpin_work, work);
> > > > +	struct drm_device *dev = work->crtc->dev;
> > > >  
> > > > -	mutex_lock(&work->dev->struct_mutex);
> > > > +	mutex_lock(&dev->struct_mutex);
> > > >  	intel_unpin_fb_obj(work->old_fb_obj);
> > > >  	drm_gem_object_unreference(&work->pending_flip_obj->base);
> > > >  	drm_gem_object_unreference(&work->old_fb_obj->base);
> > > >  
> > > > -	intel_update_fbc(work->dev);
> > > > -	mutex_unlock(&work->dev->struct_mutex);
> > > > +	intel_update_fbc(dev);
> > > > +	mutex_unlock(&dev->struct_mutex);
> > > > +
> > > > +	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
> > > > +	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
> > > > +
> > > >  	kfree(work);
> > > >  }
> > > >  
> > > > @@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
> > > >  
> > > >  	atomic_clear_mask(1 << intel_crtc->plane,
> > > >  			  &obj->pending_flip.counter);
> > > > -
> > > >  	wake_up(&dev_priv->pending_flip_queue);
> > > > -	schedule_work(&work->work);
> > > > +
> > > > +	queue_work(dev_priv->wq, &work->work);
> > > >  
> > > >  	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
> > > >  }
> > > > @@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > > >  		return -ENOMEM;
> > > >  
> > > >  	work->event = event;
> > > > -	work->dev = crtc->dev;
> > > > +	work->crtc = crtc;
> > > >  	intel_fb = to_intel_framebuffer(crtc->fb);
> > > >  	work->old_fb_obj = intel_fb->obj;
> > > >  	INIT_WORK(&work->work, intel_unpin_work_fn);
> > > > @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > > >  	intel_fb = to_intel_framebuffer(fb);
> > > >  	obj = intel_fb->obj;
> > > >  
> > > > +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> > > > +		flush_workqueue(dev_priv->wq);
> > > > +
> > > 
> > > Have you by chance tested this with the async flip patch?  I wonder if
> > > in that case whether 2 is too small, and something like 100 might be
> > > better (though really async flips are for cases where we can't keep up
> > > with refresh, so a small number shouldn't hurt too much there either).
> > 
> > The limit on 2 is due to the limited resolution of pincount. Hence my
> > earlier fear for your async flip patch.
> 
> I think for asyn flips we simply need to have a real flip queue in our
> code, instead of abusing the implicit list in the workqueue code ...
> 
> One other thing is that with async flips we don't have a natural limit on
> the number of pinned framebuffers any more, which means we can easily
> exhaust all mappable GTT space. Hence we need to integrate that new,
> explicit flip queue into our eviction code, too.
> 
> For now I'm rather happy with the flush_wq ducttape presented here ;-)

Yeah I don't have a problem with it as long as we don't block when
queuing flips in real life. :)
Chris Wilson Nov. 1, 2012, 3:52 p.m. UTC | #5
On Thu, 1 Nov 2012 08:34:47 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Thu, 1 Nov 2012 16:29:35 +0100
> Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Thu, Nov 01, 2012 at 03:18:46PM +0000, Chris Wilson wrote:
> > > On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > > > On Thu,  1 Nov 2012 09:26:26 +0000
> > > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > 
> > > > > If we accumulate unpin tasks because we are pageflipping faster than the
> > > > > system can schedule its workers, we can effectively create a
> > > > > pin-leak. The solution taken here is to limit the number of unpin tasks
> > > > > we have per-crtc and to flush those outstanding tasks if we accumulate
> > > > > too many. This should prevent any jitter in the normal case, and also
> > > > > prevent the hang if we should run too fast.
> > > > > 
> > > > > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> > > > > Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > ---
> > > > >  drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
> > > > >  drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
> > > > >  2 files changed, 19 insertions(+), 7 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> > > > > index 69b1739..800b195 100644
> > > > > --- a/drivers/gpu/drm/i915/intel_display.c
> > > > > +++ b/drivers/gpu/drm/i915/intel_display.c
> > > > > @@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
> > > > >  {
> > > > >  	struct intel_unpin_work *work =
> > > > >  		container_of(__work, struct intel_unpin_work, work);
> > > > > +	struct drm_device *dev = work->crtc->dev;
> > > > >  
> > > > > -	mutex_lock(&work->dev->struct_mutex);
> > > > > +	mutex_lock(&dev->struct_mutex);
> > > > >  	intel_unpin_fb_obj(work->old_fb_obj);
> > > > >  	drm_gem_object_unreference(&work->pending_flip_obj->base);
> > > > >  	drm_gem_object_unreference(&work->old_fb_obj->base);
> > > > >  
> > > > > -	intel_update_fbc(work->dev);
> > > > > -	mutex_unlock(&work->dev->struct_mutex);
> > > > > +	intel_update_fbc(dev);
> > > > > +	mutex_unlock(&dev->struct_mutex);
> > > > > +
> > > > > +	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
> > > > > +	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
> > > > > +
> > > > >  	kfree(work);
> > > > >  }
> > > > >  
> > > > > @@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
> > > > >  
> > > > >  	atomic_clear_mask(1 << intel_crtc->plane,
> > > > >  			  &obj->pending_flip.counter);
> > > > > -
> > > > >  	wake_up(&dev_priv->pending_flip_queue);
> > > > > -	schedule_work(&work->work);
> > > > > +
> > > > > +	queue_work(dev_priv->wq, &work->work);
> > > > >  
> > > > >  	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
> > > > >  }
> > > > > @@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > > > >  		return -ENOMEM;
> > > > >  
> > > > >  	work->event = event;
> > > > > -	work->dev = crtc->dev;
> > > > > +	work->crtc = crtc;
> > > > >  	intel_fb = to_intel_framebuffer(crtc->fb);
> > > > >  	work->old_fb_obj = intel_fb->obj;
> > > > >  	INIT_WORK(&work->work, intel_unpin_work_fn);
> > > > > @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > > > >  	intel_fb = to_intel_framebuffer(fb);
> > > > >  	obj = intel_fb->obj;
> > > > >  
> > > > > +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> > > > > +		flush_workqueue(dev_priv->wq);
> > > > > +
> > > > 
> > > > Have you by chance tested this with the async flip patch?  I wonder if
> > > > in that case whether 2 is too small, and something like 100 might be
> > > > better (though really async flips are for cases where we can't keep up
> > > > with refresh, so a small number shouldn't hurt too much there either).
> > > 
> > > The limit on 2 is due to the limited resolution of pincount. Hence my
> > > earlier fear for your async flip patch.
> > 
> > I think for asyn flips we simply need to have a real flip queue in our
> > code, instead of abusing the implicit list in the workqueue code ...
> > 
> > One other thing is that with async flips we don't have a natural limit on
> > the number of pinned framebuffers any more, which means we can easily
> > exhaust all mappable GTT space. Hence we need to integrate that new,
> > explicit flip queue into our eviction code, too.
> > 
> > For now I'm rather happy with the flush_wq ducttape presented here ;-)
> 
> Yeah I don't have a problem with it as long as we don't block when
> queuing flips in real life. :)

Actually I've justified the blocking here to myself, and prefer it to
simply running the crtc->unpin_work. If userspace is swamping the system
so badly that we can't run the kthreads quickly enough, it deserves a stall.
Note that the unpin leak is still about the 3rd most common bug in fedora,
so this stall will be forced on many machines.
-Chris
Jesse Barnes Nov. 1, 2012, 4:04 p.m. UTC | #6
On Thu, 01 Nov 2012 15:52:23 +0000
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Thu, 1 Nov 2012 08:34:47 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > On Thu, 1 Nov 2012 16:29:35 +0100
> > Daniel Vetter <daniel@ffwll.ch> wrote:
> > 
> > > On Thu, Nov 01, 2012 at 03:18:46PM +0000, Chris Wilson wrote:
> > > > On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > > > > On Thu,  1 Nov 2012 09:26:26 +0000
> > > > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > > 
> > > > > > If we accumulate unpin tasks because we are pageflipping faster than the
> > > > > > system can schedule its workers, we can effectively create a
> > > > > > pin-leak. The solution taken here is to limit the number of unpin tasks
> > > > > > we have per-crtc and to flush those outstanding tasks if we accumulate
> > > > > > too many. This should prevent any jitter in the normal case, and also
> > > > > > prevent the hang if we should run too fast.
> > > > > > 
> > > > > > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> > > > > > Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> > > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > > ---
> > > > > >  drivers/gpu/drm/i915/intel_display.c |   22 ++++++++++++++++------
> > > > > >  drivers/gpu/drm/i915/intel_drv.h     |    4 +++-
> > > > > >  2 files changed, 19 insertions(+), 7 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> > > > > > index 69b1739..800b195 100644
> > > > > > --- a/drivers/gpu/drm/i915/intel_display.c
> > > > > > +++ b/drivers/gpu/drm/i915/intel_display.c
> > > > > > @@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
> > > > > >  {
> > > > > >  	struct intel_unpin_work *work =
> > > > > >  		container_of(__work, struct intel_unpin_work, work);
> > > > > > +	struct drm_device *dev = work->crtc->dev;
> > > > > >  
> > > > > > -	mutex_lock(&work->dev->struct_mutex);
> > > > > > +	mutex_lock(&dev->struct_mutex);
> > > > > >  	intel_unpin_fb_obj(work->old_fb_obj);
> > > > > >  	drm_gem_object_unreference(&work->pending_flip_obj->base);
> > > > > >  	drm_gem_object_unreference(&work->old_fb_obj->base);
> > > > > >  
> > > > > > -	intel_update_fbc(work->dev);
> > > > > > -	mutex_unlock(&work->dev->struct_mutex);
> > > > > > +	intel_update_fbc(dev);
> > > > > > +	mutex_unlock(&dev->struct_mutex);
> > > > > > +
> > > > > > +	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
> > > > > > +	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
> > > > > > +
> > > > > >  	kfree(work);
> > > > > >  }
> > > > > >  
> > > > > > @@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
> > > > > >  
> > > > > >  	atomic_clear_mask(1 << intel_crtc->plane,
> > > > > >  			  &obj->pending_flip.counter);
> > > > > > -
> > > > > >  	wake_up(&dev_priv->pending_flip_queue);
> > > > > > -	schedule_work(&work->work);
> > > > > > +
> > > > > > +	queue_work(dev_priv->wq, &work->work);
> > > > > >  
> > > > > >  	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
> > > > > >  }
> > > > > > @@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > > > > >  		return -ENOMEM;
> > > > > >  
> > > > > >  	work->event = event;
> > > > > > -	work->dev = crtc->dev;
> > > > > > +	work->crtc = crtc;
> > > > > >  	intel_fb = to_intel_framebuffer(crtc->fb);
> > > > > >  	work->old_fb_obj = intel_fb->obj;
> > > > > >  	INIT_WORK(&work->work, intel_unpin_work_fn);
> > > > > > @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
> > > > > >  	intel_fb = to_intel_framebuffer(fb);
> > > > > >  	obj = intel_fb->obj;
> > > > > >  
> > > > > > +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> > > > > > +		flush_workqueue(dev_priv->wq);
> > > > > > +
> > > > > 
> > > > > Have you by chance tested this with the async flip patch?  I wonder if
> > > > > in that case whether 2 is too small, and something like 100 might be
> > > > > better (though really async flips are for cases where we can't keep up
> > > > > with refresh, so a small number shouldn't hurt too much there either).
> > > > 
> > > > The limit on 2 is due to the limited resolution of pincount. Hence my
> > > > earlier fear for your async flip patch.
> > > 
> > > I think for asyn flips we simply need to have a real flip queue in our
> > > code, instead of abusing the implicit list in the workqueue code ...
> > > 
> > > One other thing is that with async flips we don't have a natural limit on
> > > the number of pinned framebuffers any more, which means we can easily
> > > exhaust all mappable GTT space. Hence we need to integrate that new,
> > > explicit flip queue into our eviction code, too.
> > > 
> > > For now I'm rather happy with the flush_wq ducttape presented here ;-)
> > 
> > Yeah I don't have a problem with it as long as we don't block when
> > queuing flips in real life. :)
> 
> Actually I've justified the blocking here to myself, and prefer it to
> simply running the crtc->unpin_work. If userspace is swamping the system
> so badly that we can run the kthreads quick enough, it deserves a stall.
> Note that the unpin leak is still about the 3rd most common bug in fedora,
> so this stall will be forced on many machines.

Hm funky, why does Fedora hit it so much?  Does some of the GNOME shell
stuff run unthrottled or something?
Chris Wilson Nov. 1, 2012, 4:20 p.m. UTC | #7
On Thu, 1 Nov 2012 09:04:02 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Thu, 01 Nov 2012 15:52:23 +0000
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > Actually I've justified the blocking here to myself, and prefer it to
> > simply running the crtc->unpin_work. If userspace is swamping the system
> > so badly that we can run the kthreads quick enough, it deserves a stall.
> > Note that the unpin leak is still about the 3rd most common bug in fedora,
> > so this stall will be forced on many machines.
> 
> Hm funky, why does Fedora hit it so much?  Does some of the GNOME shell
> stuff run unthrottled or something?

I don't think so. I trust that in Tvrtko's use case, he is not so much as
hogging the GPU as keeping the system as a whole relatively busy. So I
suspect it is more to do with CPU starvation of the kthreads than
anything else.

Tvrtko, do you have any feeling for why your machine was easily
susceptible to this leak? Are the stalls noticeable and do they affect
your performance targets?
-Chris
Tvrtko Ursulin Nov. 1, 2012, 4:52 p.m. UTC | #8
On Thursday 01 November 2012 16:20:03 Chris Wilson wrote:
> On Thu, 1 Nov 2012 09:04:02 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> 
wrote:
> > On Thu, 01 Nov 2012 15:52:23 +0000
> > 
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > Actually I've justified the blocking here to myself, and prefer it to
> > > simply running the crtc->unpin_work. If userspace is swamping the system
> > > so badly that we can run the kthreads quick enough, it deserves a stall.
> > > Note that the unpin leak is still about the 3rd most common bug in
> > > fedora,
> > > so this stall will be forced on many machines.
> > 
> > Hm funky, why does Fedora hit it so much?  Does some of the GNOME shell
> > stuff run unthrottled or something?
> 
> I don't think so. I trust that in Tvrtko's use case, he is not so much as
> hogging the GPU as keeping the system as a whole relatively busy. So I
> suspect it is more to do with CPU starvation of the kthreads than
> anything else.
> 
> Tvrtko, do you have any feeling for why your machine was easily
> suspectible to this leak? Are the stalls noticeable and do they affect
> your performance targets?

We didn't bother looking for any stalls, but for a long time we were 
occasionally hitting this pin_count BUG in i915_gem_object_pin. So it didn't in 
fact affect our performance targets as much as it completely wrecked our system.

If this patch causes an occasional stall instead, given that this bug triggers 
every 3-4 hours of uptime, we are fine with that. If a frame or so is missed 
every couple hours on low end hardware we don't care that much.

More on the actual workload...

Only recently we got lucky and found a platform and workload where it happens 
reliably. And this patch reliably fixes that.

In this workload CPU is being loaded 50-60% decoding a movie and rendering it 
to a full screen window. Our proprietary compositor page flips at 60Hz only, 
not faster. Together with another small semi-transparent window being rendered 
on top of the full screen movie. Movie played is a 25fps one, which means the 
full screen window is damaged 25 out of 60 frames (give or take) which is when 
we render to our back buffer and page flip at the vsync rate (60Hz).

According to intel_gpu_top tool, GPU load is roughly at 40%, apart from the 
"Framebuffer Compression" metric which is maxed out, if that one is at all 
valid.

This particular scenario triggers the bug only on two of our Atom based 
platforms, both with a NM10/Pineview G/i915 chipset.

Tvrtko
Jesse Barnes Nov. 1, 2012, 4:58 p.m. UTC | #9
On Thu, 01 Nov 2012 16:52:05 +0000
Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk> wrote:

> On Thursday 01 November 2012 16:20:03 Chris Wilson wrote:
> > On Thu, 1 Nov 2012 09:04:02 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> 
> wrote:
> > > On Thu, 01 Nov 2012 15:52:23 +0000
> > > 
> > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > Actually I've justified the blocking here to myself, and prefer it to
> > > > simply running the crtc->unpin_work. If userspace is swamping the system
> > > > so badly that we can run the kthreads quick enough, it deserves a stall.
> > > > Note that the unpin leak is still about the 3rd most common bug in
> > > > fedora,
> > > > so this stall will be forced on many machines.
> > > 
> > > Hm funky, why does Fedora hit it so much?  Does some of the GNOME shell
> > > stuff run unthrottled or something?
> > 
> > I don't think so. I trust that in Tvrtko's use case, he is not so much as
> > hogging the GPU as keeping the system as a whole relatively busy. So I
> > suspect it is more to do with CPU starvation of the kthreads than
> > anything else.
> > 
> > Tvrtko, do you have any feeling for why your machine was easily
> > susceptible to this leak? Are the stalls noticeable and do they affect
> > your performance targets?
> 
> We didn't bother looking for any stalls, but for a long time we were 
> occasionally hitting this pin_count BUG in i915_gem_object_pin. So it didn't in 
> fact affect our performance targets as much as it completely wrecked our system.
> 
> If this patch causes an occasional stall instead, given that this bug triggers 
> every 3-4 hours of uptime, we are fine with that. If a frame or so is missed 
> every couple hours on low end hardware we don't care that much.
> 
> More on the actual workload...
> 
> Only recently we got lucky and found a platform and workload where it happens 
> reliably. And this patch reliably fixes that.
> 
> In this workload CPU is being loaded 50-60% decoding a movie and rendering it 
> to a full screen window. Our proprietary compositor page flips at 60Hz only, 
> not faster. Together with another small semi-transparent window being rendered 
> on top of the full screen movie. Movie played is a 25fps one, which means the 
> full screen window is damaged 25 out of 60 frames (give or take) which is when 
> we render to our back buffer and page flip at the vsync rate (60Hz).
> 
> According to intel_gpu_top tool, GPU load is roughly at 40%, apart from the 
> "Framebuffer Compression" metric which is maxed out, if that is one is at all 
> valid.
> 
> This particular scenario triggers the bug only on two of our Atom based 
> platforms, both with a NM10/Pineview G/i915 chipset.

Ah ok on Atom you're probably CPU constrained a bit, but still at
50-60% utilization the kthreads should be running at least sometimes...

But it sounds like a case of the kthreads not running instead of
queueing too fast anyway (not that the latter is really possible
without some hacking to the flip code).
Eric Anholt Nov. 2, 2012, 9:31 p.m. UTC | #10
Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk> writes:
> According to intel_gpu_top tool, GPU load is roughly at 40%, apart from the 
> "Framebuffer Compression" metric which is maxed out, if that is one is at all 
> valid.

Often a bit is not actually hooked up to anything, in which case it will
be interpreted as 100% busy.  We should probably turn those off on those
specific chips, but it's not a very well maintained tool (because it's
not a super useful tool, unfortunately).
Simon Farnsworth Nov. 5, 2012, 11:36 a.m. UTC | #11
On Thursday 1 November 2012 09:58:51 Jesse Barnes wrote:
> On Thu, 01 Nov 2012 16:52:05 +0000
> Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk> wrote:
> 
> > On Thursday 01 November 2012 16:20:03 Chris Wilson wrote:
> > > On Thu, 1 Nov 2012 09:04:02 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> 
> > wrote:
> > > > On Thu, 01 Nov 2012 15:52:23 +0000
> > > > 
> > > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > > Actually I've justified the blocking here to myself, and prefer it to
> > > > > simply running the crtc->unpin_work. If userspace is swamping the system
> > > > so badly that we can't run the kthreads quickly enough, it deserves a stall.
> > > > > Note that the unpin leak is still about the 3rd most common bug in
> > > > > fedora,
> > > > > so this stall will be forced on many machines.
> > > > 
> > > > Hm funky, why does Fedora hit it so much?  Does some of the GNOME shell
> > > > stuff run unthrottled or something?
> > > 
> > > I don't think so. I trust that in Tvrtko's use case, he is not so much as
> > > hogging the GPU as keeping the system as a whole relatively busy. So I
> > > suspect it is more to do with CPU starvation of the kthreads than
> > > anything else.
> > > 
> > > Tvrtko, do you have any feeling for why your machine was easily
> > > susceptible to this leak? Are the stalls noticeable and do they affect
> > > your performance targets?
> > 
> > We didn't bother looking for any stalls, but for a long time we were 
> > occasionally hitting this pin_count BUG in i915_gem_object_pin. So it didn't in 
> > fact affect our performance targets as much as it completely wrecked our system.
> > 
> > If this patch causes an occasional stall instead, given that this bug triggers 
> > every 3-4 hours of uptime, we are fine with that. If a frame or so is missed 
> > every couple hours on low end hardware we don't care that much.
> > 
> > More on the actual workload...
> > 
> > Only recently we got lucky and found a platform and workload where it happens 
> > reliably. And this patch reliably fixes that.
> > 
> > In this workload CPU is being loaded 50-60% decoding a movie and rendering it 
> > to a full screen window. Our proprietary compositor page flips at 60Hz only, 
> > not faster. Together with another small semi-transparent window being rendered 
> > on top of the full screen movie. Movie played is a 25fps one, which means the 
> > full screen window is damaged 25 out of 60 frames (give or take) which is when 
> > we render to our back buffer and page flip at the vsync rate (60Hz).
> > 
> > According to intel_gpu_top tool, GPU load is roughly at 40%, apart from the 
> > "Framebuffer Compression" metric which is maxed out, if that is one is at all 
> > valid.
> > 
> > This particular scenario triggers the bug only on two of our Atom based 
> > platforms, both with a NM10/Pineview G/i915 chipset.
> 
> Ah ok on Atom you're probably CPU constrained a bit, but still at
> 50-60% utilization the kthreads should be running at least sometimes...
> 
> But it sounds like a case of the kthreads not running instead of
> queueing too fast anyway (not that the latter is really possible
> without some hacking to the flip code).
> 
It may help you here to know that we run both our compositor and the X server
at real-time priorities - both are SCHED_RR static priority 1 (the lowest
realtime priority). IIRC, the kthreads run at SCHED_OTHER priority, so we are
quite capable of starving them during a burst of activity.
Daniel Vetter Nov. 20, 2012, 4:15 p.m. UTC | #12
On Thu, Nov 01, 2012 at 09:26:26AM +0000, Chris Wilson wrote:
> If we accumulate unpin tasks because we are pageflipping faster than the
> system can schedule its workers, we can effectively create a
> pin-leak. The solution taken here is to limit the number of unpin tasks
> we have per-crtc and to flush those outstanding tasks if we accumulate
> too many. This should prevent any jitter in the normal case, and also
> prevent the hang if we should run too fast.
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Queued for -next with a note added to the commit message about the
workqueue related deadlock. Thanks for the patch.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 69b1739..800b195 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -6908,14 +6908,19 @@  static void intel_unpin_work_fn(struct work_struct *__work)
 {
 	struct intel_unpin_work *work =
 		container_of(__work, struct intel_unpin_work, work);
+	struct drm_device *dev = work->crtc->dev;
 
-	mutex_lock(&work->dev->struct_mutex);
+	mutex_lock(&dev->struct_mutex);
 	intel_unpin_fb_obj(work->old_fb_obj);
 	drm_gem_object_unreference(&work->pending_flip_obj->base);
 	drm_gem_object_unreference(&work->old_fb_obj->base);
 
-	intel_update_fbc(work->dev);
-	mutex_unlock(&work->dev->struct_mutex);
+	intel_update_fbc(dev);
+	mutex_unlock(&dev->struct_mutex);
+
+	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
+	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
+
 	kfree(work);
 }
 
@@ -6963,9 +6968,9 @@  static void do_intel_finish_page_flip(struct drm_device *dev,
 
 	atomic_clear_mask(1 << intel_crtc->plane,
 			  &obj->pending_flip.counter);
-
 	wake_up(&dev_priv->pending_flip_queue);
-	schedule_work(&work->work);
+
+	queue_work(dev_priv->wq, &work->work);
 
 	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
 }
@@ -7266,7 +7271,7 @@  static int intel_crtc_page_flip(struct drm_crtc *crtc,
 		return -ENOMEM;
 
 	work->event = event;
-	work->dev = crtc->dev;
+	work->crtc = crtc;
 	intel_fb = to_intel_framebuffer(crtc->fb);
 	work->old_fb_obj = intel_fb->obj;
 	INIT_WORK(&work->work, intel_unpin_work_fn);
@@ -7291,6 +7296,9 @@  static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	intel_fb = to_intel_framebuffer(fb);
 	obj = intel_fb->obj;
 
+	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
+		flush_workqueue(dev_priv->wq);
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto cleanup;
@@ -7309,6 +7317,7 @@  static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	 * the flip occurs and the object is no longer visible.
 	 */
 	atomic_add(1 << intel_crtc->plane, &work->old_fb_obj->pending_flip);
+	atomic_inc(&intel_crtc->unpin_work_count);
 
 	ret = dev_priv->display.queue_flip(dev, crtc, fb, obj);
 	if (ret)
@@ -7323,6 +7332,7 @@  static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	return 0;
 
 cleanup_pending:
+	atomic_dec(&intel_crtc->unpin_work_count);
 	atomic_sub(1 << intel_crtc->plane, &work->old_fb_obj->pending_flip);
 	drm_gem_object_unreference(&work->old_fb_obj->base);
 	drm_gem_object_unreference(&obj->base);
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 164696f..1345c44 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -216,6 +216,8 @@  struct intel_crtc {
 	} vblank_work;
 	int fdi_lanes;
 
+	atomic_t unpin_work_count;
+
 	/* Display surface base address adjustement for pageflips. Note that on
 	 * gen4+ this only adjusts up to a tile, offsets within a tile are
 	 * handled in the hw itself (with the TILEOFF register). */
@@ -403,7 +405,7 @@  intel_get_crtc_for_plane(struct drm_device *dev, int plane)
 
 struct intel_unpin_work {
 	struct work_struct work;
-	struct drm_device *dev;
+	struct drm_crtc *crtc;
 	struct drm_i915_gem_object *old_fb_obj;
 	struct drm_i915_gem_object *pending_flip_obj;
 	struct drm_pending_vblank_event *event;