Message ID | f0cd66427039ce1bdc61460a9d833e6d858cff3e.1441363361.git.lukas@wunner.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Jun 30, 2015 at 10:06:27AM +0100, Lukas Wunner wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > We had two failure modes here: > > 1. > Deadlock in intelfb_alloc failure path where it calls > drm_framebuffer_remove, which grabs the struct mutex and intelfb_create > (caller of intelfb_alloc) was already holding it. > > 2. > Deadlock in intelfb_create failure path where it calls > drm_framebuffer_unreference, which grabs the struct mutex and > intelfb_create was already holding it. > > v2: > * Reformat commit msg to 72 chars. (Lukas Wunner) > * Add third failure mode. (Lukas Wunner) > > v3: > * On fb alloc failure, unref gem object where it gets refed, > fix double unref in separate commit. (Ville Syrjälä) > > v4: > * Lock struct mutex on unref. (Chris Wilson) > > v5: > * Rebase on drm-intel-nightly 2015y-09m-04d-08h-19m-35s UTC, > rephrase commit message. (Jani Nicula) > > Tested-by: Pierre Moreau <pierre.morrow@free.fr> > [MBP 5,3 2009 nvidia 9400M + 9600M GT pre-retina] > Tested-by: Paul Hordiienko <pvt.gord@gmail.com> > [MBP 6,2 2010 intel ILK + nvidia GT216 pre-retina] > Tested-by: William Brown <william@blackhats.net.au> > [MBP 8,2 2011 intel SNB + amd turks pre-retina] > Tested-by: Lukas Wunner <lukas@wunner.de> > [MBP 9,1 2012 intel IVB + nvidia GK107 pre-retina] > Tested-by: Bruno Bierbaumer <bruno@bierbaumer.net> > [MBP 11,3 2013 intel HSW + nvidia GK107 retina] > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Fixes: 60a5ca015ffd ("drm/i915: Add locking around > framebuffer_references--") > Reported-by: Lukas Wunner <lukas@wunner.de> > [Lukas: Create v3 + v4 + v5 based on Tvrtko's v2] > Signed-off-by: Lukas Wunner <lukas@wunner.de> > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> > Cc: Jani Nikula <jani.nikula@intel.com> > --- > drivers/gpu/drm/i915/intel_fbdev.c | 20 ++++++++++++-------- > 1 file changed, 12 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c > index 96476d7..eee3306 100644 > --- a/drivers/gpu/drm/i915/intel_fbdev.c > +++ b/drivers/gpu/drm/i915/intel_fbdev.c > @@ -119,7 +119,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > { > struct intel_fbdev *ifbdev = > container_of(helper, struct intel_fbdev, helper); > - struct drm_framebuffer *fb; > + struct drm_framebuffer *fb = NULL; > struct drm_device *dev = helper->dev; > struct drm_mode_fb_cmd2 mode_cmd = {}; > struct drm_i915_gem_object *obj; > @@ -137,6 +137,8 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, > sizes->surface_depth); > > + mutex_lock(&dev->struct_mutex); > + > size = mode_cmd.pitches[0] * mode_cmd.height; > size = PAGE_ALIGN(size); > obj = i915_gem_object_create_stolen(dev, size); > @@ -158,18 +160,21 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > ret = intel_pin_and_fence_fb_obj(NULL, fb, NULL, NULL, NULL); > if (ret) { > DRM_ERROR("failed to pin obj: %d\n", ret); > - goto out_fb; > + goto out_unref; > } > > + mutex_unlock(&dev->struct_mutex); > + > ifbdev->fb = to_intel_framebuffer(fb); > > return 0; > > -out_fb: > - drm_framebuffer_remove(fb); > out_unref: > drm_gem_object_unreference(&obj->base); If fb init succeeded it took over the ref, no? So drm_framebuffer_remove() will now attempt to unref one too many times. This taking over refs stuff is confusing. Maybe it would be better if everyone just took an extra ref when they stash the obj pointer somewhere, and everyone would then always release whatever ref they own and no longer need. > out: > + mutex_unlock(&dev->struct_mutex); > + if (fb) > + drm_framebuffer_remove(fb); > return ret; > } > > @@ -187,8 +192,6 @@ static int intelfb_create(struct drm_fb_helper *helper, > int size, ret; > bool prealloc = false; > > - mutex_lock(&dev->struct_mutex); > - > if (intel_fb && > (sizes->fb_width > intel_fb->base.width || > sizes->fb_height > intel_fb->base.height)) { > @@ -203,7 +206,7 @@ static int intelfb_create(struct drm_fb_helper *helper, > DRM_DEBUG_KMS("no BIOS fb, allocating a new one\n"); > ret = intelfb_alloc(helper, sizes); > if (ret) > - goto out_unlock; > + return ret; > intel_fb = ifbdev->fb; > } else { > DRM_DEBUG_KMS("re-using BIOS fb\n"); > @@ -215,6 +218,8 @@ static int intelfb_create(struct drm_fb_helper *helper, > obj = intel_fb->obj; > size = obj->base.size; > > + mutex_lock(&dev->struct_mutex); > + I'm thinking we won't even need the lock here anymore. But maybe I'm missing something. > info = drm_fb_helper_alloc_fbi(helper); > if (IS_ERR(info)) { > ret = PTR_ERR(info); > @@ -276,7 +281,6 @@ out_destroy_fbi: > out_unpin: > i915_gem_object_ggtt_unpin(obj); > drm_gem_object_unreference(&obj->base); And this ref we don't own either AFAICS. > -out_unlock: > mutex_unlock(&dev->struct_mutex); > return ret; > } > -- > 2.1.0
On Tue, Oct 13, 2015 at 06:04:40PM +0300, Ville Syrjälä wrote: > > + mutex_lock(&dev->struct_mutex); > > + > > I'm thinking we won't even need the lock here anymore. But maybe I'm > missing something. Yeah, not so much here atm, but there is a bug here where we don't take an explicit pin on the VMA we setup for the fbdev which requires the lock. -chris
Hi Ville, On Tue, Oct 13, 2015 at 06:04:40PM +0300, Ville Syrjälä wrote: > On Tue, Jun 30, 2015 at 10:06:27AM +0100, Lukas Wunner wrote: > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > > We had two failure modes here: > > > > 1. > > Deadlock in intelfb_alloc failure path where it calls > > drm_framebuffer_remove, which grabs the struct mutex and intelfb_create > > (caller of intelfb_alloc) was already holding it. > > > > 2. > > Deadlock in intelfb_create failure path where it calls > > drm_framebuffer_unreference, which grabs the struct mutex and > > intelfb_create was already holding it. > > > > v2: > > * Reformat commit msg to 72 chars. (Lukas Wunner) > > * Add third failure mode. (Lukas Wunner) > > > > v3: > > * On fb alloc failure, unref gem object where it gets refed, > > fix double unref in separate commit. (Ville Syrjälä) > > > > v4: > > * Lock struct mutex on unref. (Chris Wilson) > > > > v5: > > * Rebase on drm-intel-nightly 2015y-09m-04d-08h-19m-35s UTC, > > rephrase commit message. (Jani Nicula) > > > > Tested-by: Pierre Moreau <pierre.morrow@free.fr> > > [MBP 5,3 2009 nvidia 9400M + 9600M GT pre-retina] > > Tested-by: Paul Hordiienko <pvt.gord@gmail.com> > > [MBP 6,2 2010 intel ILK + nvidia GT216 pre-retina] > > Tested-by: William Brown <william@blackhats.net.au> > > [MBP 8,2 2011 intel SNB + amd turks pre-retina] > > Tested-by: Lukas Wunner <lukas@wunner.de> > > [MBP 9,1 2012 intel IVB + nvidia GK107 pre-retina] > > Tested-by: Bruno Bierbaumer <bruno@bierbaumer.net> > > [MBP 11,3 2013 intel HSW + nvidia GK107 retina] > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Fixes: 60a5ca015ffd ("drm/i915: Add locking around > > framebuffer_references--") > > Reported-by: Lukas Wunner <lukas@wunner.de> > > [Lukas: Create v3 + v4 + v5 based on Tvrtko's v2] > > Signed-off-by: Lukas Wunner <lukas@wunner.de> > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> > > Cc: Jani Nikula <jani.nikula@intel.com> > > --- > > drivers/gpu/drm/i915/intel_fbdev.c | 20 ++++++++++++-------- > > 1 file changed, 12 insertions(+), 8 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c > > index 96476d7..eee3306 100644 > > --- a/drivers/gpu/drm/i915/intel_fbdev.c > > +++ b/drivers/gpu/drm/i915/intel_fbdev.c > > @@ -119,7 +119,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > { > > struct intel_fbdev *ifbdev = > > container_of(helper, struct intel_fbdev, helper); > > - struct drm_framebuffer *fb; > > + struct drm_framebuffer *fb = NULL; > > struct drm_device *dev = helper->dev; > > struct drm_mode_fb_cmd2 mode_cmd = {}; > > struct drm_i915_gem_object *obj; > > @@ -137,6 +137,8 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, > > sizes->surface_depth); > > > > + mutex_lock(&dev->struct_mutex); > > + > > size = mode_cmd.pitches[0] * mode_cmd.height; > > size = PAGE_ALIGN(size); > > obj = i915_gem_object_create_stolen(dev, size); > > @@ -158,18 +160,21 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > ret = intel_pin_and_fence_fb_obj(NULL, fb, NULL, NULL, NULL); > > if (ret) { > > DRM_ERROR("failed to pin obj: %d\n", ret); > > - goto out_fb; > > + goto out_unref; > > } > > > > + mutex_unlock(&dev->struct_mutex); > > + > > ifbdev->fb = to_intel_framebuffer(fb); > > > > return 0; > > > > -out_fb: > > - drm_framebuffer_remove(fb); > > out_unref: > > drm_gem_object_unreference(&obj->base); > > If fb init succeeded it took over the ref, no? So drm_framebuffer_remove() > will now attempt to unref one too many times. > > This taking over refs stuff is confusing. Maybe it would be better if > everyone just took an extra ref when they stash the obj pointer > somewhere, and everyone would then always release whatever ref they own > and no longer need. > > > out: > > + mutex_unlock(&dev->struct_mutex); > > + if (fb) > > + drm_framebuffer_remove(fb); > > return ret; > > } > > Hm, why do you think we unref one too many times? A bit further up in this function we call __intel_framebuffer_create() which sets the refcount to 1. (It calls intel_framebuffer_init(), which calls drm_framebuffer_init(), which calls kref_init(&fb->refcount).) So if intel_pin_and_fence_fb_obj() fails, we do need to unreference and tear down the fb. Thus, drm_framebuffer_remove() seems right here to me. However, because of your objection I've noticed now that "if (fb)" seems to be wrong, I think this should be "if (!IS_ERR_OR_NULL(fb))". Because if __intel_framebuffer_create() failed, fb will be a PTR_ERR(), so not null, and we'd call drm_framebuffer_remove() on this. Is that what you meant? > > @@ -187,8 +192,6 @@ static int intelfb_create(struct drm_fb_helper *helper, > > int size, ret; > > bool prealloc = false; > > > > - mutex_lock(&dev->struct_mutex); > > - > > if (intel_fb && > > (sizes->fb_width > intel_fb->base.width || > > sizes->fb_height > intel_fb->base.height)) { > > @@ -203,7 +206,7 @@ static int intelfb_create(struct drm_fb_helper *helper, > > DRM_DEBUG_KMS("no BIOS fb, allocating a new one\n"); > > ret = intelfb_alloc(helper, sizes); > > if (ret) > > - goto out_unlock; > > + return ret; > > intel_fb = ifbdev->fb; > > } else { > > DRM_DEBUG_KMS("re-using BIOS fb\n"); > > @@ -215,6 +218,8 @@ static int intelfb_create(struct drm_fb_helper *helper, > > obj = intel_fb->obj; > > size = obj->base.size; > > > > + mutex_lock(&dev->struct_mutex); > > + > > I'm thinking we won't even need the lock here anymore. But maybe I'm > missing something. > > > info = drm_fb_helper_alloc_fbi(helper); > > if (IS_ERR(info)) { > > ret = PTR_ERR(info); > > @@ -276,7 +281,6 @@ out_destroy_fbi: > > out_unpin: > > i915_gem_object_ggtt_unpin(obj); > > drm_gem_object_unreference(&obj->base); > > And this ref we don't own either AFAICS. Why? We did call intelfb_alloc() above, so if something subsequently goes wrong, we need to revert the steps that intelfb_alloc() carried out. The drm_gem_object_unreference() therefore seems right here to me. However I'm puzzled why we don't call drm_framebuffer_remove() under the out_unpin: label. Aren't we leaking a framebuffer here without that? Maybe you're referring to the fact that this function either inherits the BIOS fb or creates a new fb with intelfb_alloc(). I'm not sure if the cleanup on error is identical in these two cases. Maybe you meant that we don't own the ref in the case that the fb was inherited from BIOS? Best regards, Lukas > > > -out_unlock: > > mutex_unlock(&dev->struct_mutex); > > return ret; > > } > > -- > > 2.1.0 > > -- > Ville Syrjälä > Intel OTC
On Thu, Oct 15, 2015 at 07:14:35PM +0200, Lukas Wunner wrote: > Hi Ville, > > On Tue, Oct 13, 2015 at 06:04:40PM +0300, Ville Syrjälä wrote: > > On Tue, Jun 30, 2015 at 10:06:27AM +0100, Lukas Wunner wrote: > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > > > > We had two failure modes here: > > > > > > 1. > > > Deadlock in intelfb_alloc failure path where it calls > > > drm_framebuffer_remove, which grabs the struct mutex and intelfb_create > > > (caller of intelfb_alloc) was already holding it. > > > > > > 2. > > > Deadlock in intelfb_create failure path where it calls > > > drm_framebuffer_unreference, which grabs the struct mutex and > > > intelfb_create was already holding it. > > > > > > v2: > > > * Reformat commit msg to 72 chars. (Lukas Wunner) > > > * Add third failure mode. (Lukas Wunner) > > > > > > v3: > > > * On fb alloc failure, unref gem object where it gets refed, > > > fix double unref in separate commit. (Ville Syrjälä) > > > > > > v4: > > > * Lock struct mutex on unref. (Chris Wilson) > > > > > > v5: > > > * Rebase on drm-intel-nightly 2015y-09m-04d-08h-19m-35s UTC, > > > rephrase commit message. (Jani Nicula) > > > > > > Tested-by: Pierre Moreau <pierre.morrow@free.fr> > > > [MBP 5,3 2009 nvidia 9400M + 9600M GT pre-retina] > > > Tested-by: Paul Hordiienko <pvt.gord@gmail.com> > > > [MBP 6,2 2010 intel ILK + nvidia GT216 pre-retina] > > > Tested-by: William Brown <william@blackhats.net.au> > > > [MBP 8,2 2011 intel SNB + amd turks pre-retina] > > > Tested-by: Lukas Wunner <lukas@wunner.de> > > > [MBP 9,1 2012 intel IVB + nvidia GK107 pre-retina] > > > Tested-by: Bruno Bierbaumer <bruno@bierbaumer.net> > > > [MBP 11,3 2013 intel HSW + nvidia GK107 retina] > > > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > Fixes: 60a5ca015ffd ("drm/i915: Add locking around > > > framebuffer_references--") > > > Reported-by: Lukas Wunner <lukas@wunner.de> > > > [Lukas: Create v3 + v4 + v5 based on Tvrtko's v2] > > > Signed-off-by: Lukas Wunner <lukas@wunner.de> > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> > > > Cc: Jani Nikula <jani.nikula@intel.com> > > > --- > > > drivers/gpu/drm/i915/intel_fbdev.c | 20 ++++++++++++-------- > > > 1 file changed, 12 insertions(+), 8 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c > > > index 96476d7..eee3306 100644 > > > --- a/drivers/gpu/drm/i915/intel_fbdev.c > > > +++ b/drivers/gpu/drm/i915/intel_fbdev.c > > > @@ -119,7 +119,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > { > > > struct intel_fbdev *ifbdev = > > > container_of(helper, struct intel_fbdev, helper); > > > - struct drm_framebuffer *fb; > > > + struct drm_framebuffer *fb = NULL; > > > struct drm_device *dev = helper->dev; > > > struct drm_mode_fb_cmd2 mode_cmd = {}; > > > struct drm_i915_gem_object *obj; > > > @@ -137,6 +137,8 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, > > > sizes->surface_depth); > > > > > > + mutex_lock(&dev->struct_mutex); > > > + > > > size = mode_cmd.pitches[0] * mode_cmd.height; > > > size = PAGE_ALIGN(size); > > > obj = i915_gem_object_create_stolen(dev, size); > > > @@ -158,18 +160,21 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > ret = intel_pin_and_fence_fb_obj(NULL, fb, NULL, NULL, NULL); > > > if (ret) { > > > DRM_ERROR("failed to pin obj: %d\n", ret); > > > - goto out_fb; > > > + goto out_unref; > > > } > > > > > > + mutex_unlock(&dev->struct_mutex); > > > + > > > ifbdev->fb = to_intel_framebuffer(fb); > > > > > > return 0; > > > > > > -out_fb: > > > - drm_framebuffer_remove(fb); > > > out_unref: > > > drm_gem_object_unreference(&obj->base); > > > > If fb init succeeded it took over the ref, no? So drm_framebuffer_remove() > > will now attempt to unref one too many times. > > > > This taking over refs stuff is confusing. Maybe it would be better if > > everyone just took an extra ref when they stash the obj pointer > > somewhere, and everyone would then always release whatever ref they own > > and no longer need. > > > > > out: > > > + mutex_unlock(&dev->struct_mutex); > > > + if (fb) > > > + drm_framebuffer_remove(fb); > > > return ret; > > > } > > > > > Hm, why do you think we unref one too many times? > > A bit further up in this function we call __intel_framebuffer_create() > which sets the refcount to 1. (It calls intel_framebuffer_init(), which > calls drm_framebuffer_init(), which calls kref_init(&fb->refcount).) > > So if intel_pin_and_fence_fb_obj() fails, we do need to unreference and > tear down the fb. Thus, drm_framebuffer_remove() seems right here to me. > > However, because of your objection I've noticed now that "if (fb)" seems > to be wrong, I think this should be "if (!IS_ERR_OR_NULL(fb))". > > Because if __intel_framebuffer_create() failed, fb will be a PTR_ERR(), > so not null, and we'd call drm_framebuffer_remove() on this. Is that > what you meant? Quick aside: drm_framebuffer_remove is for framebuffers that userspace has created. Since this is init code a drm_framebuffer_unreference should be all we need - and drm_framebuffer_remove is getting somewhat defeatured. -Daniel > > > > > @@ -187,8 +192,6 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > int size, ret; > > > bool prealloc = false; > > > > > > - mutex_lock(&dev->struct_mutex); > > > - > > > if (intel_fb && > > > (sizes->fb_width > intel_fb->base.width || > > > sizes->fb_height > intel_fb->base.height)) { > > > @@ -203,7 +206,7 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > DRM_DEBUG_KMS("no BIOS fb, allocating a new one\n"); > > > ret = intelfb_alloc(helper, sizes); > > > if (ret) > > > - goto out_unlock; > > > + return ret; > > > intel_fb = ifbdev->fb; > > > } else { > > > DRM_DEBUG_KMS("re-using BIOS fb\n"); > > > @@ -215,6 +218,8 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > obj = intel_fb->obj; > > > size = obj->base.size; > > > > > > + mutex_lock(&dev->struct_mutex); > > > + > > > > I'm thinking we won't even need the lock here anymore. But maybe I'm > > missing something. > > > > > info = drm_fb_helper_alloc_fbi(helper); > > > if (IS_ERR(info)) { > > > ret = PTR_ERR(info); > > > @@ -276,7 +281,6 @@ out_destroy_fbi: > > > out_unpin: > > > i915_gem_object_ggtt_unpin(obj); > > > drm_gem_object_unreference(&obj->base); > > > > And this ref we don't own either AFAICS. > > Why? We did call intelfb_alloc() above, so if something subsequently > goes wrong, we need to revert the steps that intelfb_alloc() carried > out. The drm_gem_object_unreference() therefore seems right here to me. > > However I'm puzzled why we don't call drm_framebuffer_remove() under > the out_unpin: label. Aren't we leaking a framebuffer here without that? > > Maybe you're referring to the fact that this function either inherits > the BIOS fb or creates a new fb with intelfb_alloc(). I'm not sure if > the cleanup on error is identical in these two cases. Maybe you meant > that we don't own the ref in the case that the fb was inherited from > BIOS? > > > Best regards, > > Lukas > > > > > > -out_unlock: > > > mutex_unlock(&dev->struct_mutex); > > > return ret; > > > } > > > -- > > > 2.1.0 > > > > -- > > Ville Syrjälä > > Intel OTC > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Thu, Oct 15, 2015 at 07:14:35PM +0200, Lukas Wunner wrote: > Hi Ville, > > On Tue, Oct 13, 2015 at 06:04:40PM +0300, Ville Syrjälä wrote: > > On Tue, Jun 30, 2015 at 10:06:27AM +0100, Lukas Wunner wrote: > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > > > > We had two failure modes here: > > > > > > 1. > > > Deadlock in intelfb_alloc failure path where it calls > > > drm_framebuffer_remove, which grabs the struct mutex and intelfb_create > > > (caller of intelfb_alloc) was already holding it. > > > > > > 2. > > > Deadlock in intelfb_create failure path where it calls > > > drm_framebuffer_unreference, which grabs the struct mutex and > > > intelfb_create was already holding it. > > > > > > v2: > > > * Reformat commit msg to 72 chars. (Lukas Wunner) > > > * Add third failure mode. (Lukas Wunner) > > > > > > v3: > > > * On fb alloc failure, unref gem object where it gets refed, > > > fix double unref in separate commit. (Ville Syrjälä) > > > > > > v4: > > > * Lock struct mutex on unref. (Chris Wilson) > > > > > > v5: > > > * Rebase on drm-intel-nightly 2015y-09m-04d-08h-19m-35s UTC, > > > rephrase commit message. (Jani Nicula) > > > > > > Tested-by: Pierre Moreau <pierre.morrow@free.fr> > > > [MBP 5,3 2009 nvidia 9400M + 9600M GT pre-retina] > > > Tested-by: Paul Hordiienko <pvt.gord@gmail.com> > > > [MBP 6,2 2010 intel ILK + nvidia GT216 pre-retina] > > > Tested-by: William Brown <william@blackhats.net.au> > > > [MBP 8,2 2011 intel SNB + amd turks pre-retina] > > > Tested-by: Lukas Wunner <lukas@wunner.de> > > > [MBP 9,1 2012 intel IVB + nvidia GK107 pre-retina] > > > Tested-by: Bruno Bierbaumer <bruno@bierbaumer.net> > > > [MBP 11,3 2013 intel HSW + nvidia GK107 retina] > > > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > Fixes: 60a5ca015ffd ("drm/i915: Add locking around > > > framebuffer_references--") > > > Reported-by: Lukas Wunner <lukas@wunner.de> > > > [Lukas: Create v3 + v4 + v5 based on Tvrtko's v2] > > > Signed-off-by: Lukas Wunner <lukas@wunner.de> > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> > > > Cc: Jani Nikula <jani.nikula@intel.com> > > > --- > > > drivers/gpu/drm/i915/intel_fbdev.c | 20 ++++++++++++-------- > > > 1 file changed, 12 insertions(+), 8 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c > > > index 96476d7..eee3306 100644 > > > --- a/drivers/gpu/drm/i915/intel_fbdev.c > > > +++ b/drivers/gpu/drm/i915/intel_fbdev.c > > > @@ -119,7 +119,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > { > > > struct intel_fbdev *ifbdev = > > > container_of(helper, struct intel_fbdev, helper); > > > - struct drm_framebuffer *fb; > > > + struct drm_framebuffer *fb = NULL; > > > struct drm_device *dev = helper->dev; > > > struct drm_mode_fb_cmd2 mode_cmd = {}; > > > struct drm_i915_gem_object *obj; > > > @@ -137,6 +137,8 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, > > > sizes->surface_depth); > > > > > > + mutex_lock(&dev->struct_mutex); > > > + > > > size = mode_cmd.pitches[0] * mode_cmd.height; > > > size = PAGE_ALIGN(size); > > > obj = i915_gem_object_create_stolen(dev, size); > > > @@ -158,18 +160,21 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > ret = intel_pin_and_fence_fb_obj(NULL, fb, NULL, NULL, NULL); > > > if (ret) { > > > DRM_ERROR("failed to pin obj: %d\n", ret); > > > - goto out_fb; > > > + goto out_unref; > > > } > > > > > > + mutex_unlock(&dev->struct_mutex); > > > + > > > ifbdev->fb = to_intel_framebuffer(fb); > > > > > > return 0; > > > > > > -out_fb: > > > - drm_framebuffer_remove(fb); > > > out_unref: > > > drm_gem_object_unreference(&obj->base); > > > > If fb init succeeded it took over the ref, no? So drm_framebuffer_remove() > > will now attempt to unref one too many times. > > > > This taking over refs stuff is confusing. Maybe it would be better if > > everyone just took an extra ref when they stash the obj pointer > > somewhere, and everyone would then always release whatever ref they own > > and no longer need. > > > > > out: > > > + mutex_unlock(&dev->struct_mutex); > > > + if (fb) > > > + drm_framebuffer_remove(fb); > > > return ret; > > > } > > > > > Hm, why do you think we unref one too many times? Because the fb now owns the reference, so it gets unreffed by the fb .destroy() hook... I think. > > A bit further up in this function we call __intel_framebuffer_create() > which sets the refcount to 1. (It calls intel_framebuffer_init(), which > calls drm_framebuffer_init(), which calls kref_init(&fb->refcount).) > > So if intel_pin_and_fence_fb_obj() fails, we do need to unreference and > tear down the fb. Thus, drm_framebuffer_remove() seems right here to me. I wasn't complaining about the fb unref, but the bo unref. > > However, because of your objection I've noticed now that "if (fb)" seems > to be wrong, I think this should be "if (!IS_ERR_OR_NULL(fb))". > > Because if __intel_framebuffer_create() failed, fb will be a PTR_ERR(), > so not null, and we'd call drm_framebuffer_remove() on this. Is that > what you meant? No, but that's a good observation too. > > > > > @@ -187,8 +192,6 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > int size, ret; > > > bool prealloc = false; > > > > > > - mutex_lock(&dev->struct_mutex); > > > - > > > if (intel_fb && > > > (sizes->fb_width > intel_fb->base.width || > > > sizes->fb_height > intel_fb->base.height)) { > > > @@ -203,7 +206,7 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > DRM_DEBUG_KMS("no BIOS fb, allocating a new one\n"); > > > ret = intelfb_alloc(helper, sizes); > > > if (ret) > > > - goto out_unlock; > > > + return ret; > > > intel_fb = ifbdev->fb; > > > } else { > > > DRM_DEBUG_KMS("re-using BIOS fb\n"); > > > @@ -215,6 +218,8 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > obj = intel_fb->obj; > > > size = obj->base.size; > > > > > > + mutex_lock(&dev->struct_mutex); > > > + > > > > I'm thinking we won't even need the lock here anymore. But maybe I'm > > missing something. > > > > > info = drm_fb_helper_alloc_fbi(helper); > > > if (IS_ERR(info)) { > > > ret = PTR_ERR(info); > > > @@ -276,7 +281,6 @@ out_destroy_fbi: > > > out_unpin: > > > i915_gem_object_ggtt_unpin(obj); > > > drm_gem_object_unreference(&obj->base); > > > > And this ref we don't own either AFAICS. > > Why? We did call intelfb_alloc() above, so if something subsequently > goes wrong, we need to revert the steps that intelfb_alloc() carried > out. The drm_gem_object_unreference() therefore seems right here to me. Here too the fb (if succesfully created) now owns that reference. > > However I'm puzzled why we don't call drm_framebuffer_remove() under > the out_unpin: label. Aren't we leaking a framebuffer here without that? > > Maybe you're referring to the fact that this function either inherits > the BIOS fb or creates a new fb with intelfb_alloc(). I'm not sure if > the cleanup on error is identical in these two cases. Maybe you meant > that we don't own the ref in the case that the fb was inherited from > BIOS? > > > Best regards, > > Lukas > > > > > > -out_unlock: > > > mutex_unlock(&dev->struct_mutex); > > > return ret; > > > } > > > -- > > > 2.1.0 > > > > -- > > Ville Syrjälä > > Intel OTC
Hi Ville, On Thu, Oct 15, 2015 at 08:34:23PM +0300, Ville Syrjälä wrote: > On Thu, Oct 15, 2015 at 07:14:35PM +0200, Lukas Wunner wrote: > > Hi Ville, > > > > On Tue, Oct 13, 2015 at 06:04:40PM +0300, Ville Syrjälä wrote: > > > On Tue, Jun 30, 2015 at 10:06:27AM +0100, Lukas Wunner wrote: > > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > > > > > > We had two failure modes here: > > > > > > > > 1. > > > > Deadlock in intelfb_alloc failure path where it calls > > > > drm_framebuffer_remove, which grabs the struct mutex and intelfb_create > > > > (caller of intelfb_alloc) was already holding it. > > > > > > > > 2. > > > > Deadlock in intelfb_create failure path where it calls > > > > drm_framebuffer_unreference, which grabs the struct mutex and > > > > intelfb_create was already holding it. > > > > > > > > v2: > > > > * Reformat commit msg to 72 chars. (Lukas Wunner) > > > > * Add third failure mode. (Lukas Wunner) > > > > > > > > v3: > > > > * On fb alloc failure, unref gem object where it gets refed, > > > > fix double unref in separate commit. (Ville Syrjälä) > > > > > > > > v4: > > > > * Lock struct mutex on unref. (Chris Wilson) > > > > > > > > v5: > > > > * Rebase on drm-intel-nightly 2015y-09m-04d-08h-19m-35s UTC, > > > > rephrase commit message. (Jani Nicula) > > > > > > > > Tested-by: Pierre Moreau <pierre.morrow@free.fr> > > > > [MBP 5,3 2009 nvidia 9400M + 9600M GT pre-retina] > > > > Tested-by: Paul Hordiienko <pvt.gord@gmail.com> > > > > [MBP 6,2 2010 intel ILK + nvidia GT216 pre-retina] > > > > Tested-by: William Brown <william@blackhats.net.au> > > > > [MBP 8,2 2011 intel SNB + amd turks pre-retina] > > > > Tested-by: Lukas Wunner <lukas@wunner.de> > > > > [MBP 9,1 2012 intel IVB + nvidia GK107 pre-retina] > > > > Tested-by: Bruno Bierbaumer <bruno@bierbaumer.net> > > > > [MBP 11,3 2013 intel HSW + nvidia GK107 retina] > > > > > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > > > Fixes: 60a5ca015ffd ("drm/i915: Add locking around > > > > framebuffer_references--") > > > > Reported-by: Lukas Wunner <lukas@wunner.de> > > > > [Lukas: Create v3 + v4 + v5 based on Tvrtko's v2] > > > > Signed-off-by: Lukas Wunner <lukas@wunner.de> > > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> > > > > Cc: Jani Nikula <jani.nikula@intel.com> > > > > --- > > > > drivers/gpu/drm/i915/intel_fbdev.c | 20 ++++++++++++-------- > > > > 1 file changed, 12 insertions(+), 8 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c > > > > index 96476d7..eee3306 100644 > > > > --- a/drivers/gpu/drm/i915/intel_fbdev.c > > > > +++ b/drivers/gpu/drm/i915/intel_fbdev.c > > > > @@ -119,7 +119,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > > { > > > > struct intel_fbdev *ifbdev = > > > > container_of(helper, struct intel_fbdev, helper); > > > > - struct drm_framebuffer *fb; > > > > + struct drm_framebuffer *fb = NULL; > > > > struct drm_device *dev = helper->dev; > > > > struct drm_mode_fb_cmd2 mode_cmd = {}; > > > > struct drm_i915_gem_object *obj; > > > > @@ -137,6 +137,8 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > > mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, > > > > sizes->surface_depth); > > > > > > > > + mutex_lock(&dev->struct_mutex); > > > > + > > > > size = mode_cmd.pitches[0] * mode_cmd.height; > > > > size = PAGE_ALIGN(size); > > > > obj = i915_gem_object_create_stolen(dev, size); > > > > @@ -158,18 +160,21 @@ static int intelfb_alloc(struct drm_fb_helper *helper, > > > > ret = intel_pin_and_fence_fb_obj(NULL, fb, NULL, NULL, NULL); > > > > if (ret) { > > > > DRM_ERROR("failed to pin obj: %d\n", ret); > > > > - goto out_fb; > > > > + goto out_unref; > > > > } > > > > > > > > + mutex_unlock(&dev->struct_mutex); > > > > + > > > > ifbdev->fb = to_intel_framebuffer(fb); > > > > > > > > return 0; > > > > > > > > -out_fb: > > > > - drm_framebuffer_remove(fb); > > > > out_unref: > > > > drm_gem_object_unreference(&obj->base); > > > > > > If fb init succeeded it took over the ref, no? So drm_framebuffer_remove() > > > will now attempt to unref one too many times. > > > > > > This taking over refs stuff is confusing. Maybe it would be better if > > > everyone just took an extra ref when they stash the obj pointer > > > somewhere, and everyone would then always release whatever ref they own > > > and no longer need. > > > > > > > out: > > > > + mutex_unlock(&dev->struct_mutex); > > > > + if (fb) > > > > + drm_framebuffer_remove(fb); > > > > return ret; > > > > } > > > > > > > > Hm, why do you think we unref one too many times? > > Because the fb now owns the reference, so it gets unreffed by the fb > .destroy() hook... I think. You're right. drm_framebuffer_remove() calls drm_framebuffer_unreference(), if this was the last ref then drm_framebuffer_free() gets called, which invokes the ->destroy callback intel_user_framebuffer_destroy(), which in turn calls drm_gem_object_unreference(). So indeed it gets unrefed twice here. > > > > A bit further up in this function we call __intel_framebuffer_create() > > which sets the refcount to 1. (It calls intel_framebuffer_init(), which > > calls drm_framebuffer_init(), which calls kref_init(&fb->refcount).) > > > > So if intel_pin_and_fence_fb_obj() fails, we do need to unreference and > > tear down the fb. Thus, drm_framebuffer_remove() seems right here to me. > > I wasn't complaining about the fb unref, but the bo unref. > > > > > However, because of your objection I've noticed now that "if (fb)" seems > > to be wrong, I think this should be "if (!IS_ERR_OR_NULL(fb))". > > > > Because if __intel_framebuffer_create() failed, fb will be a PTR_ERR(), > > so not null, and we'd call drm_framebuffer_remove() on this. Is that > > what you meant? > > No, but that's a good observation too. > > > > > > > > > @@ -187,8 +192,6 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > > int size, ret; > > > > bool prealloc = false; > > > > > > > > - mutex_lock(&dev->struct_mutex); > > > > - > > > > if (intel_fb && > > > > (sizes->fb_width > intel_fb->base.width || > > > > sizes->fb_height > intel_fb->base.height)) { > > > > @@ -203,7 +206,7 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > > DRM_DEBUG_KMS("no BIOS fb, allocating a new one\n"); > > > > ret = intelfb_alloc(helper, sizes); > > > > if (ret) > > > > - goto out_unlock; > > > > + return ret; > > > > intel_fb = ifbdev->fb; > > > > } else { > > > > DRM_DEBUG_KMS("re-using BIOS fb\n"); > > > > @@ -215,6 +218,8 @@ static int intelfb_create(struct drm_fb_helper *helper, > > > > obj = intel_fb->obj; > > > > size = obj->base.size; > > > > > > > > + mutex_lock(&dev->struct_mutex); > > > > + > > > > > > I'm thinking we won't even need the lock here anymore. But maybe I'm > > > missing something. > > > > > > > info = drm_fb_helper_alloc_fbi(helper); > > > > if (IS_ERR(info)) { > > > > ret = PTR_ERR(info); > > > > @@ -276,7 +281,6 @@ out_destroy_fbi: > > > > out_unpin: > > > > i915_gem_object_ggtt_unpin(obj); > > > > drm_gem_object_unreference(&obj->base); > > > > > > And this ref we don't own either AFAICS. > > > > Why? We did call intelfb_alloc() above, so if something subsequently > > goes wrong, we need to revert the steps that intelfb_alloc() carried > > out. The drm_gem_object_unreference() therefore seems right here to me. > > Here too the fb (if succesfully created) now owns that reference. But here the ->destroy callback isn't invoked because the fb isn't unrefed. There's no call to drm_framebuffer_unreference() / drm_framebuffer_remove(), so the gem object isn't unrefed twice but we seem to leak the fb. That's exactly what I find confusing here, if intelfb_alloc() above was successful and something else subsequently goes awry, we need to unref the fb, right? Or am I missing something? In the case when we've inherited the fb from BIOS (instead of creating it with intelfb_alloc()), is it okay to unref the fb as well? Best regards, Lukas > > However I'm puzzled why we don't call drm_framebuffer_remove() under > > the out_unpin: label. Aren't we leaking a framebuffer here without that? > > > > Maybe you're referring to the fact that this function either inherits > > the BIOS fb or creates a new fb with intelfb_alloc(). I'm not sure if > > the cleanup on error is identical in these two cases. Maybe you meant > > that we don't own the ref in the case that the fb was inherited from > > BIOS? > > > > > > Best regards, > > > > Lukas > > > > > > > > > -out_unlock: > > > > mutex_unlock(&dev->struct_mutex); > > > > return ret; > > > > } > > > > -- > > > > 2.1.0 > > > > > > -- > > > Ville Syrjälä > > > Intel OTC > > -- > Ville Syrjälä > Intel OTC
Next iteration of these fixes, taking into account the additional issues discovered in the recent exchange with Ville (this series is in-reply-to that thread): * Patch 1 now with Reviewed-by tag by Ville (thanks!), moved to the front of the series. * Patch 2 is new, fixes a double unref of the bo in a failure path of intelfb_alloc, spotted by Ville. * Patch 3 is another attempt to eliminate the deadlocks in intelfb_alloc and intelfb_create, patch originally by Tvrtko. New in v6: Avoid unrefing the fb if it's an ERR_PTR, replace drm_framebuffer_remove with drm_framebuffer_unreference (Daniel). * Patch 4 is new, plugs leak of a struct intel_fbdev in intelfb_create. Not sure if I made the right call in the case where the fb was inherited from BIOS, comments welcome. (I decided to leave the fb as is and limp on. Alternative would be to call drm_framebuffer_remove to free the memory occupied by the fb and disable the crtc; the fbcon will be unusable anyway at this point and if X11 is able to start up without errors, it should be able to reinitialize the crtc.) Browsable on GitHub: https://github.com/l1k/linux/commits/intel_fbdev_fixes Lukas Wunner (3): drm/i915: On fb alloc failure, unref gem object where it gets refed drm/i915: Fix double unref in intelfb_alloc failure path drm/i915: Fix error handling in intelfb_create Tvrtko Ursulin (1): drm/i915: Fix failure paths around initial fbdev allocation drivers/gpu/drm/i915/intel_display.c | 21 +++++++++++++------- drivers/gpu/drm/i915/intel_fbdev.c | 38 +++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c index 96476d7..eee3306 100644 --- a/drivers/gpu/drm/i915/intel_fbdev.c +++ b/drivers/gpu/drm/i915/intel_fbdev.c @@ -119,7 +119,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper, { struct intel_fbdev *ifbdev = container_of(helper, struct intel_fbdev, helper); - struct drm_framebuffer *fb; + struct drm_framebuffer *fb = NULL; struct drm_device *dev = helper->dev; struct drm_mode_fb_cmd2 mode_cmd = {}; struct drm_i915_gem_object *obj; @@ -137,6 +137,8 @@ static int intelfb_alloc(struct drm_fb_helper *helper, mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, sizes->surface_depth); + mutex_lock(&dev->struct_mutex); + size = mode_cmd.pitches[0] * mode_cmd.height; size = PAGE_ALIGN(size); obj = i915_gem_object_create_stolen(dev, size); @@ -158,18 +160,21 @@ static int intelfb_alloc(struct drm_fb_helper *helper, ret = intel_pin_and_fence_fb_obj(NULL, fb, NULL, NULL, NULL); if (ret) { DRM_ERROR("failed to pin obj: %d\n", ret); - goto out_fb; + goto out_unref; } + mutex_unlock(&dev->struct_mutex); + ifbdev->fb = to_intel_framebuffer(fb); return 0; -out_fb: - drm_framebuffer_remove(fb); out_unref: drm_gem_object_unreference(&obj->base); out: + mutex_unlock(&dev->struct_mutex); + if (fb) + drm_framebuffer_remove(fb); return ret; } @@ -187,8 +192,6 @@ static int intelfb_create(struct drm_fb_helper *helper, int size, ret; bool prealloc = false; - mutex_lock(&dev->struct_mutex); - if (intel_fb && (sizes->fb_width > intel_fb->base.width || sizes->fb_height > intel_fb->base.height)) { @@ -203,7 +206,7 @@ static int intelfb_create(struct drm_fb_helper *helper, DRM_DEBUG_KMS("no BIOS fb, allocating a new one\n"); ret = intelfb_alloc(helper, sizes); if (ret) - goto out_unlock; + return ret; intel_fb = ifbdev->fb; } else { DRM_DEBUG_KMS("re-using BIOS fb\n"); @@ -215,6 +218,8 @@ static int intelfb_create(struct drm_fb_helper *helper, obj = intel_fb->obj; size = obj->base.size; + mutex_lock(&dev->struct_mutex); + info = drm_fb_helper_alloc_fbi(helper); if (IS_ERR(info)) { ret = PTR_ERR(info); @@ -276,7 +281,6 @@ out_destroy_fbi: out_unpin: i915_gem_object_ggtt_unpin(obj); drm_gem_object_unreference(&obj->base); -out_unlock: mutex_unlock(&dev->struct_mutex); return ret; }