Message ID | 1351761986-27982-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
On Thu, 1 Nov 2012 09:26:26 +0000 Chris Wilson <chris@chris-wilson.co.uk> wrote:

> If we accumulate unpin tasks because we are pageflipping faster than the
> system can schedule its workers, we can effectively create a
> pin-leak. The solution taken here is to limit the number of unpin tasks
> we have per-crtc and to flush those outstanding tasks if we accumulate
> too many. This should prevent any jitter in the normal case, and also
> prevent the hang if we should run too fast.
>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>
> [...]
>
> @@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  	intel_fb = to_intel_framebuffer(fb);
>  	obj = intel_fb->obj;
>
> +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> +		flush_workqueue(dev_priv->wq);
> +

Have you by chance tested this with the async flip patch? I wonder
whether in that case 2 is too small, and whether something like 100 might
be better (though really async flips are for cases where we can't keep up
with refresh, so a small number shouldn't hurt too much there either).
On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:

> On Thu, 1 Nov 2012 09:26:26 +0000 Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> > [...]
> >
> > +	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
> > +		flush_workqueue(dev_priv->wq);
> > +
>
> Have you by chance tested this with the async flip patch? I wonder
> whether in that case 2 is too small, and whether something like 100 might
> be better (though really async flips are for cases where we can't keep up
> with refresh, so a small number shouldn't hurt too much there either).

The limit of 2 is due to the limited resolution of pincount. Hence my
earlier fear for your async flip patch.
-Chris
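For context on the pincount remark: in i915 of this era the per-object pin
count was stored in a narrow bitfield, so only a handful of simultaneous
pins were representable before the counter saturated and pinning hit a
BUG. The compilable sketch below illustrates that constraint; the 4-bit
width and all names are assumptions for illustration, not the driver's
exact layout.

#include <errno.h>

/* Sketch: each queued-but-unretired flip keeps the old framebuffer
 * pinned. Assuming the pin count is a 4-bit field, only 15 pins are
 * representable; leaked unpin work eventually saturates it. */
struct sketch_gem_object {
	unsigned int pin_count : 4;	/* assumed width */
};

static int sketch_pin(struct sketch_gem_object *obj)
{
	if (obj->pin_count == 0xf)	/* counter saturated */
		return -EBUSY;		/* the kernel BUGs out here instead */
	obj->pin_count++;
	return 0;
}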
On Thu, Nov 01, 2012 at 03:18:46PM +0000, Chris Wilson wrote:

> On Thu, 1 Nov 2012 08:07:59 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
>
> > [...]
> >
> > Have you by chance tested this with the async flip patch? I wonder
> > whether in that case 2 is too small, and whether something like 100
> > might be better (though really async flips are for cases where we
> > can't keep up with refresh, so a small number shouldn't hurt too much
> > there either).
>
> The limit of 2 is due to the limited resolution of pincount. Hence my
> earlier fear for your async flip patch.

I think for async flips we simply need to have a real flip queue in our
code, instead of abusing the implicit list in the workqueue code ...

One other thing is that with async flips we don't have a natural limit on
the number of pinned framebuffers any more, which means we can easily
exhaust all mappable GTT space. Hence we need to integrate that new,
explicit flip queue into our eviction code, too.

For now I'm rather happy with the flush_wq duct tape presented here ;-)

Cheers, Daniel
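In outline, the explicit per-CRTC flip queue Daniel describes would track
flips on a driver-owned list with a depth that both the throttling and the
eviction paths can inspect, rather than hiding them as anonymous work
items. The sketch below is entirely hypothetical (invented names and
structure), illustrating the idea rather than any actual driver code.

#include <stddef.h>

/* Hypothetical shape of an explicit per-CRTC flip queue: flips sit on
 * a visible list with an explicit depth, so throttling and eviction
 * logic can both reason about how many framebuffers are pinned. */
struct flip_queue_entry {
	struct flip_queue_entry *next;
	void *framebuffer_obj;		/* stand-in for the pinned fb */
};

struct crtc_flip_queue {
	struct flip_queue_entry *head, *tail;
	unsigned int depth;		/* visible to eviction code */
	unsigned int max_depth;		/* bounds pinned framebuffers */
};

/* Returns 0 on success, -1 if the caller must first retire flips
 * (the moral equivalent of the flush_workqueue() throttle). */
static int flip_queue_add(struct crtc_flip_queue *q,
			  struct flip_queue_entry *e)
{
	if (q->depth >= q->max_depth)
		return -1;
	e->next = NULL;
	if (q->tail)
		q->tail->next = e;
	else
		q->head = e;
	q->tail = e;
	q->depth++;
	return 0;
}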
On Thu, 1 Nov 2012 16:29:35 +0100 Daniel Vetter <daniel@ffwll.ch> wrote:

> [...]
>
> I think for async flips we simply need to have a real flip queue in our
> code, instead of abusing the implicit list in the workqueue code ...
>
> One other thing is that with async flips we don't have a natural limit on
> the number of pinned framebuffers any more, which means we can easily
> exhaust all mappable GTT space. Hence we need to integrate that new,
> explicit flip queue into our eviction code, too.
>
> For now I'm rather happy with the flush_wq duct tape presented here ;-)

Yeah, I don't have a problem with it as long as we don't block when
queuing flips in real life. :)
On Thu, 1 Nov 2012 08:34:47 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:

> On Thu, 1 Nov 2012 16:29:35 +0100 Daniel Vetter <daniel@ffwll.ch> wrote:
>
> > [...]
> >
> > For now I'm rather happy with the flush_wq duct tape presented here ;-)
>
> Yeah, I don't have a problem with it as long as we don't block when
> queuing flips in real life. :)

Actually I've justified the blocking here to myself, and prefer it to
simply running the crtc->unpin_work directly. If userspace is swamping
the system so badly that we can't run the kthreads quickly enough, it
deserves a stall. Note that the unpin leak is still about the 3rd most
common bug in Fedora, so this stall will be forced on many machines.
-Chris
On Thu, 01 Nov 2012 15:52:23 +0000 Chris Wilson <chris@chris-wilson.co.uk> wrote:

> [...]
>
> Actually I've justified the blocking here to myself, and prefer it to
> simply running the crtc->unpin_work directly. If userspace is swamping
> the system so badly that we can't run the kthreads quickly enough, it
> deserves a stall. Note that the unpin leak is still about the 3rd most
> common bug in Fedora, so this stall will be forced on many machines.

Hm, funky. Why does Fedora hit it so much? Does some of the GNOME shell
stuff run unthrottled or something?
On Thu, 1 Nov 2012 09:04:02 -0700, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:

> On Thu, 01 Nov 2012 15:52:23 +0000 Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> > [...]
>
> Hm, funky. Why does Fedora hit it so much? Does some of the GNOME shell
> stuff run unthrottled or something?

I don't think so. I trust that in Tvrtko's use case, he is not so much
hogging the GPU as keeping the system as a whole relatively busy. So I
suspect it is more to do with CPU starvation of the kthreads than
anything else.

Tvrtko, do you have any feeling for why your machine was so easily
susceptible to this leak? Are the stalls noticeable and do they affect
your performance targets?
-Chris
On Thursday 01 November 2012 16:20:03 Chris Wilson wrote:

> [...]
>
> Tvrtko, do you have any feeling for why your machine was so easily
> susceptible to this leak? Are the stalls noticeable and do they affect
> your performance targets?

We didn't bother looking for any stalls, but for a long time we were
occasionally hitting this pin_count BUG in i915_gem_object_pin. So it
didn't so much affect our performance targets as completely wreck our
system.

If this patch causes an occasional stall instead, given that this bug
triggers every 3-4 hours of uptime, we are fine with that. If a frame or
so is missed every couple of hours on low-end hardware we don't care that
much.

More on the actual workload...

Only recently we got lucky and found a platform and workload where it
happens reliably. And this patch reliably fixes that.

In this workload the CPU is 50-60% loaded decoding a movie and rendering
it to a full screen window. Our proprietary compositor page flips at 60Hz
only, not faster, and renders another small semi-transparent window on
top of the full screen movie. The movie played is a 25fps one, which
means the full screen window is damaged 25 out of 60 frames (give or
take), which is when we render to our back buffer and page flip at the
vsync rate (60Hz).

According to the intel_gpu_top tool, GPU load is roughly at 40%, apart
from the "Framebuffer Compression" metric which is maxed out, if that one
is at all valid.

This particular scenario triggers the bug only on two of our Atom-based
platforms, both with an NM10/Pineview G/i915 chipset.

Tvrtko
On Thu, 01 Nov 2012 16:52:05 +0000 Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk> wrote:

> [...]
>
> According to the intel_gpu_top tool, GPU load is roughly at 40%, apart
> from the "Framebuffer Compression" metric which is maxed out, if that
> one is at all valid.
>
> This particular scenario triggers the bug only on two of our Atom-based
> platforms, both with an NM10/Pineview G/i915 chipset.

Ah, ok, on Atom you're probably CPU constrained a bit, but still, at
50-60% utilization the kthreads should be running at least sometimes...

But it sounds like a case of the kthreads not running rather than of
queueing too fast anyway (not that the latter is really possible without
some hacking to the flip code).
Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk> writes:

> According to the intel_gpu_top tool, GPU load is roughly at 40%, apart
> from the "Framebuffer Compression" metric which is maxed out, if that
> one is at all valid.

Often a bit is not actually hooked up to anything, in which case it will
be interpreted as 100% busy. We should probably turn those off on those
specific chips, but it's not a very well maintained tool (because it's
not a super useful tool, unfortunately).
On Thursday 1 November 2012 09:58:51 Jesse Barnes wrote:

> [...]
>
> Ah, ok, on Atom you're probably CPU constrained a bit, but still, at
> 50-60% utilization the kthreads should be running at least sometimes...
>
> But it sounds like a case of the kthreads not running rather than of
> queueing too fast anyway (not that the latter is really possible without
> some hacking to the flip code).

It may help you here to know that we run both our compositor and the X
server at real-time priorities - both are SCHED_RR static priority 1 (the
lowest real-time priority). IIRC, the kthreads run at SCHED_OTHER
priority, so we are quite capable of starving them during a burst of
activity.
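For illustration, this is roughly the userspace setup Tvrtko describes:
pinning a process to SCHED_RR priority 1 via the standard
sched_setscheduler(2) call. A minimal sketch, not the actual compositor
code.

#include <sched.h>
#include <stdio.h>

/* Put the calling process on the lowest real-time round-robin
 * priority. Kernel worker threads default to SCHED_OTHER, so a busy
 * SCHED_RR task can starve them during a burst of activity. */
int main(void)
{
	struct sched_param sp = { .sched_priority = 1 };

	if (sched_setscheduler(0, SCHED_RR, &sp) != 0) {
		perror("sched_setscheduler (needs CAP_SYS_NICE)");
		return 1;
	}
	/* ... real-time work that can outcompete the kworkers ... */
	return 0;
}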
On Thu, Nov 01, 2012 at 09:26:26AM +0000, Chris Wilson wrote:

> If we accumulate unpin tasks because we are pageflipping faster than the
> system can schedule its workers, we can effectively create a
> pin-leak. The solution taken here is to limit the number of unpin tasks
> we have per-crtc and to flush those outstanding tasks if we accumulate
> too many. This should prevent any jitter in the normal case, and also
> prevent the hang if we should run too fast.
>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
> Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Queued for -next, with a note added to the commit message about the
workqueue-related deadlock. Thanks for the patch.
-Daniel
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 69b1739..800b195 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -6908,14 +6908,19 @@ static void intel_unpin_work_fn(struct work_struct *__work)
 {
 	struct intel_unpin_work *work =
 		container_of(__work, struct intel_unpin_work, work);
+	struct drm_device *dev = work->crtc->dev;
 
-	mutex_lock(&work->dev->struct_mutex);
+	mutex_lock(&dev->struct_mutex);
 	intel_unpin_fb_obj(work->old_fb_obj);
 	drm_gem_object_unreference(&work->pending_flip_obj->base);
 	drm_gem_object_unreference(&work->old_fb_obj->base);
 
-	intel_update_fbc(work->dev);
-	mutex_unlock(&work->dev->struct_mutex);
+	intel_update_fbc(dev);
+	mutex_unlock(&dev->struct_mutex);
+
+	BUG_ON(atomic_read(&to_intel_crtc(work->crtc)->unpin_work_count) == 0);
+	atomic_dec(&to_intel_crtc(work->crtc)->unpin_work_count);
+
 	kfree(work);
 }
 
@@ -6963,9 +6968,9 @@ static void do_intel_finish_page_flip(struct drm_device *dev,
 
 	atomic_clear_mask(1 << intel_crtc->plane,
 			  &obj->pending_flip.counter);
-
 	wake_up(&dev_priv->pending_flip_queue);
-	schedule_work(&work->work);
+
+	queue_work(dev_priv->wq, &work->work);
 
 	trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj);
 }
@@ -7266,7 +7271,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 		return -ENOMEM;
 
 	work->event = event;
-	work->dev = crtc->dev;
+	work->crtc = crtc;
 	intel_fb = to_intel_framebuffer(crtc->fb);
 	work->old_fb_obj = intel_fb->obj;
 	INIT_WORK(&work->work, intel_unpin_work_fn);
@@ -7291,6 +7296,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	intel_fb = to_intel_framebuffer(fb);
 	obj = intel_fb->obj;
 
+	if (atomic_read(&intel_crtc->unpin_work_count) >= 2)
+		flush_workqueue(dev_priv->wq);
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto cleanup;
@@ -7309,6 +7317,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	 * the flip occurs and the object is no longer visible.
 	 */
 	atomic_add(1 << intel_crtc->plane, &work->old_fb_obj->pending_flip);
+	atomic_inc(&intel_crtc->unpin_work_count);
 
 	ret = dev_priv->display.queue_flip(dev, crtc, fb, obj);
 	if (ret)
@@ -7323,6 +7332,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	return 0;
 
 cleanup_pending:
+	atomic_dec(&intel_crtc->unpin_work_count);
 	atomic_sub(1 << intel_crtc->plane, &work->old_fb_obj->pending_flip);
 	drm_gem_object_unreference(&work->old_fb_obj->base);
 	drm_gem_object_unreference(&obj->base);
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 164696f..1345c44 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -216,6 +216,8 @@ struct intel_crtc {
 	} vblank_work;
 	int fdi_lanes;
 
+	atomic_t unpin_work_count;
+
 	/* Display surface base address adjustement for pageflips. Note that on
 	 * gen4+ this only adjusts up to a tile, offsets within a tile are
 	 * handled in the hw itself (with the TILEOFF register). */
@@ -403,7 +405,7 @@ intel_get_crtc_for_plane(struct drm_device *dev, int plane)
 
 struct intel_unpin_work {
 	struct work_struct work;
-	struct drm_device *dev;
+	struct drm_crtc *crtc;
 	struct drm_i915_gem_object *old_fb_obj;
 	struct drm_i915_gem_object *pending_flip_obj;
 	struct drm_pending_vblank_event *event;
If we accumulate unpin tasks because we are pageflipping faster than the
system can schedule its workers, we can effectively create a pin-leak.
The solution taken here is to limit the number of unpin tasks we have
per-crtc and to flush those outstanding tasks if we accumulate too many.
This should prevent any jitter in the normal case, and also prevent the
hang if we should run too fast.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=46991
Reported-and-tested-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_display.c | 22 ++++++++++++++++------
 drivers/gpu/drm/i915/intel_drv.h     |  4 +++-
 2 files changed, 19 insertions(+), 7 deletions(-)
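The core pattern of the patch, restated standalone: a per-CRTC atomic
counter bounds the number of in-flight deferred unpins, and the flip path
drains the workqueue once the bound is reached. The sketch below is a
compilable toy with invented names and a fake single-threaded workqueue;
it assumes nothing about i915 internals beyond what the diff above shows.

#include <stdatomic.h>
#include <stdio.h>

#define UNPIN_WORK_LIMIT 2

struct sketch_crtc {
	atomic_int unpin_work_count;
};

static struct sketch_crtc crtc;
static int pending;	/* stand-in for the workqueue's item list */

/* Worker body: unpin the old framebuffer, then release the slot. */
static void sketch_unpin_work_fn(void)
{
	atomic_fetch_sub(&crtc.unpin_work_count, 1);
}

/* Stand-in for flush_workqueue(): run every queued cleanup. */
static void sketch_flush_workqueue(void)
{
	while (pending > 0) {
		sketch_unpin_work_fn();
		pending--;
	}
}

/* Flip path: throttle before taking a new cleanup slot. */
static void sketch_page_flip(void)
{
	if (atomic_load(&crtc.unpin_work_count) >= UNPIN_WORK_LIMIT) {
		printf("throttling: draining %d cleanups\n", pending);
		sketch_flush_workqueue();	/* deliberate stall */
	}
	atomic_fetch_add(&crtc.unpin_work_count, 1);
	pending++;	/* cleanup queued; a worker would run it later */
}

int main(void)
{
	for (int i = 0; i < 8; i++)	/* flips outrunning the worker */
		sketch_page_flip();
	sketch_flush_workqueue();
	return 0;
}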