Message ID | 1342185256-16024-3-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, 13 Jul 2012 14:14:05 +0100 Chris Wilson <chris@chris-wilson.co.uk> wrote: > Otherwise we end up trying to unpin a freed object and BUG. > Just a quick excuse: originally do_switch was just a very low level helper, and everything called i915_switch_context (including default setup), so the bug likely didn't exist in older revisions of the series. Since you're moving around so much code, maybe we could just move do_switch() a bit higher in the file so we don't have to do the forward declaration crap anymore? I am a bit confused why I didn't hit this in my testing, but thanks for tracking it down. I must say though, really like this patch, and the cleanups it does in addition to fixing the bug is way understated in the commit message. Still, a couple of comments below. > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Ben Widawsky <ben@bwidawsk.net> > --- > drivers/gpu/drm/i915/i915_gem_context.c | 82 +++++++++++-------------------- > 1 file changed, 30 insertions(+), 52 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index 90857f8..b0a855f 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -97,8 +97,7 @@ > > static struct i915_hw_context * > i915_gem_context_get(struct drm_i915_file_private *file_priv, u32 id); > -static int do_switch(struct drm_i915_gem_object *from_obj, > - struct i915_hw_context *to, u32 seqno); > +static int do_switch(struct i915_hw_context *to); > > static int get_context_size(struct drm_device *dev) > { > @@ -220,26 +219,24 @@ static int create_default_context(struct drm_i915_private *dev_priv) > */ > dev_priv->ring[RCS].default_context = ctx; > ret = i915_gem_object_pin(ctx->obj, CONTEXT_ALIGN, false); > - if (ret) { > - do_destroy(ctx); > - return ret; > - } > + if (ret) > + goto err_destroy; > > ret = i915_gem_object_set_to_gtt_domain(ctx->obj, true); > - if (ret) { > - i915_gem_object_unpin(ctx->obj); > - do_destroy(ctx); > - return ret; > - } > + if (ret) > + goto err_unpin; > > - ret = do_switch(NULL, ctx, 0); > - if (ret) { > - i915_gem_object_unpin(ctx->obj); > - do_destroy(ctx); > - } else { > - DRM_DEBUG_DRIVER("Default HW context loaded\n"); > - } > + ret = do_switch(ctx); > + if (ret) > + goto err_unpin; > > + DRM_DEBUG_DRIVER("Default HW context loaded\n"); > + return 0; > + > +err_unpin: > + i915_gem_object_unpin(ctx->obj); > +err_destroy: > + do_destroy(ctx); > return ret; > } > > @@ -366,16 +363,14 @@ mi_set_context(struct intel_ring_buffer *ring, > return ret; > } > > -static int do_switch(struct drm_i915_gem_object *from_obj, > - struct i915_hw_context *to, > - u32 seqno) > +static int do_switch(struct i915_hw_context *to) > { > - struct intel_ring_buffer *ring = NULL; > + struct drm_i915_gem_object *from_obj = to->ring->last_context_obj; > u32 hw_flags = 0; > int ret; > > - BUG_ON(to == NULL); > - BUG_ON(from_obj != NULL && from_obj->pin_count == 0); > + if (from_obj == to->obj) > + return 0; The second BUG_ON I think is still fairly useful (though it would be nicer to have a context_id in the object so we could check at the time of unpin). > > ret = i915_gem_object_pin(to->obj, CONTEXT_ALIGN, false); > if (ret) > @@ -389,8 +384,7 @@ static int do_switch(struct drm_i915_gem_object *from_obj, > else if (WARN_ON_ONCE(from_obj == to->obj)) /* not yet expected */ > hw_flags |= MI_FORCE_RESTORE; > > - ring = to->ring; > - ret = mi_set_context(ring, to, hw_flags); > + ret = mi_set_context(to->ring, to, hw_flags); > if (ret) { > i915_gem_object_unpin(to->obj); > return ret; > @@ -403,6 +397,7 @@ static int do_switch(struct drm_i915_gem_object *from_obj, > * MI_SET_CONTEXT instead of when the next seqno has completed. > */ > if (from_obj != NULL) { > + u32 seqno = i915_gem_next_request_seqno(to->ring); I haven't really looked hard, but we do end up here quite early in init, reset, unfreeze. Hopefully this is always safe to do. > /* As long as MI_SET_CONTEXT is serializing, ie. it flushes the > * whole damn pipeline, we don't need to explicitly mark the > * object dirty. The only exception is that the context must be > @@ -412,13 +407,15 @@ static int do_switch(struct drm_i915_gem_object *from_obj, > */ > from_obj->base.write_domain = I915_GEM_DOMAIN_INSTRUCTION; > from_obj->base.read_domains = I915_GEM_DOMAIN_INSTRUCTION; > - i915_gem_object_move_to_active(from_obj, ring, seqno); > + i915_gem_object_move_to_active(from_obj, to->ring, seqno); > from_obj->dirty = 1; > BUG_ON(from_obj->ring != to->ring); > i915_gem_object_unpin(from_obj); > + drm_gem_object_unreference(&from_obj->base); > } > > - ring->last_context_obj = to->obj; > + drm_gem_object_reference(&to->obj->base); > + to->ring->last_context_obj = to->obj; > to->is_initialized = true; > > return 0; > @@ -442,10 +439,7 @@ int i915_switch_context(struct intel_ring_buffer *ring, > int to_id) > { > struct drm_i915_private *dev_priv = ring->dev->dev_private; > - struct drm_i915_file_private *file_priv = NULL; > struct i915_hw_context *to; > - struct drm_i915_gem_object *from_obj = ring->last_context_obj; > - int ret; > > if (dev_priv->hw_contexts_disabled) > return 0; > @@ -453,34 +447,18 @@ int i915_switch_context(struct intel_ring_buffer *ring, > if (ring != &dev_priv->ring[RCS]) > return 0; > > - if (file) > - file_priv = file->driver_priv; > - > if (to_id == DEFAULT_CONTEXT_ID) { > to = ring->default_context; > } else { > - to = i915_gem_context_get(file_priv, to_id); > + if (file == NULL) > + return -EINVAL; > + > + to = i915_gem_context_get(file->driver_priv, to_id); > if (to == NULL) > return -ENOENT; > } > > - if (from_obj == to->obj) > - return 0; > - > - ret = do_switch(from_obj, to, i915_gem_next_request_seqno(to->ring)); > - if (ret) > - return ret; > - > - /* Just to make the code a little cleaner we take the object reference > - * after the switch was successful. It would be more intuitive to ref > - * the 'to' object before the switch but we know the refcount must be >0 > - * if context_get() succeeded, and we hold struct mutex. So it's safe to > - * do this here/now > - */ > - drm_gem_object_reference(&to->obj->base); > - if (from_obj != NULL) > - drm_gem_object_unreference(&from_obj->base); > - return ret; > + return do_switch(to); > } > > int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
On Fri, Jul 13, 2012 at 02:14:05PM +0100, Chris Wilson wrote: > Otherwise we end up trying to unpin a freed object and BUG. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Ben Widawsky <ben@bwidawsk.net> Afact this patch contains quite some code refactoring that does not relate directly to the fix (or if it does, I fail to see the direct relevance). So I think this either needs an explanation in the commit message or be put into a separate patch (I agree though for actual code cleanups). For the fix itself I seem to be a bit dense again - the only thing I see is that you move the refcount handling into do_switch. Afacs we do the ref-handling in both cases only when do_switch is successful, and also right at the end of do_switch (or right afterwards). So can you please enlighten your clueless maintainer a bit an explain how things blow up? Yours, Daniel > --- > drivers/gpu/drm/i915/i915_gem_context.c | 82 +++++++++++-------------------- > 1 file changed, 30 insertions(+), 52 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index 90857f8..b0a855f 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -97,8 +97,7 @@ > > static struct i915_hw_context * > i915_gem_context_get(struct drm_i915_file_private *file_priv, u32 id); > -static int do_switch(struct drm_i915_gem_object *from_obj, > - struct i915_hw_context *to, u32 seqno); > +static int do_switch(struct i915_hw_context *to); > > static int get_context_size(struct drm_device *dev) > { > @@ -220,26 +219,24 @@ static int create_default_context(struct drm_i915_private *dev_priv) > */ > dev_priv->ring[RCS].default_context = ctx; > ret = i915_gem_object_pin(ctx->obj, CONTEXT_ALIGN, false); > - if (ret) { > - do_destroy(ctx); > - return ret; > - } > + if (ret) > + goto err_destroy; > > ret = i915_gem_object_set_to_gtt_domain(ctx->obj, true); > - if (ret) { > - i915_gem_object_unpin(ctx->obj); > - do_destroy(ctx); > - return ret; > - } > + if (ret) > + goto err_unpin; > > - ret = do_switch(NULL, ctx, 0); > - if (ret) { > - i915_gem_object_unpin(ctx->obj); > - do_destroy(ctx); > - } else { > - DRM_DEBUG_DRIVER("Default HW context loaded\n"); > - } > + ret = do_switch(ctx); > + if (ret) > + goto err_unpin; > > + DRM_DEBUG_DRIVER("Default HW context loaded\n"); > + return 0; > + > +err_unpin: > + i915_gem_object_unpin(ctx->obj); > +err_destroy: > + do_destroy(ctx); > return ret; > } > > @@ -366,16 +363,14 @@ mi_set_context(struct intel_ring_buffer *ring, > return ret; > } > > -static int do_switch(struct drm_i915_gem_object *from_obj, > - struct i915_hw_context *to, > - u32 seqno) > +static int do_switch(struct i915_hw_context *to) > { > - struct intel_ring_buffer *ring = NULL; > + struct drm_i915_gem_object *from_obj = to->ring->last_context_obj; > u32 hw_flags = 0; > int ret; > > - BUG_ON(to == NULL); > - BUG_ON(from_obj != NULL && from_obj->pin_count == 0); > + if (from_obj == to->obj) > + return 0; > > ret = i915_gem_object_pin(to->obj, CONTEXT_ALIGN, false); > if (ret) > @@ -389,8 +384,7 @@ static int do_switch(struct drm_i915_gem_object *from_obj, > else if (WARN_ON_ONCE(from_obj == to->obj)) /* not yet expected */ > hw_flags |= MI_FORCE_RESTORE; > > - ring = to->ring; > - ret = mi_set_context(ring, to, hw_flags); > + ret = mi_set_context(to->ring, to, hw_flags); > if (ret) { > i915_gem_object_unpin(to->obj); > return ret; > @@ -403,6 +397,7 @@ static int do_switch(struct drm_i915_gem_object *from_obj, > * MI_SET_CONTEXT instead of when the next seqno has completed. > */ > if (from_obj != NULL) { > + u32 seqno = i915_gem_next_request_seqno(to->ring); > /* As long as MI_SET_CONTEXT is serializing, ie. it flushes the > * whole damn pipeline, we don't need to explicitly mark the > * object dirty. The only exception is that the context must be > @@ -412,13 +407,15 @@ static int do_switch(struct drm_i915_gem_object *from_obj, > */ > from_obj->base.write_domain = I915_GEM_DOMAIN_INSTRUCTION; > from_obj->base.read_domains = I915_GEM_DOMAIN_INSTRUCTION; > - i915_gem_object_move_to_active(from_obj, ring, seqno); > + i915_gem_object_move_to_active(from_obj, to->ring, seqno); > from_obj->dirty = 1; > BUG_ON(from_obj->ring != to->ring); > i915_gem_object_unpin(from_obj); > + drm_gem_object_unreference(&from_obj->base); > } > > - ring->last_context_obj = to->obj; > + drm_gem_object_reference(&to->obj->base); > + to->ring->last_context_obj = to->obj; > to->is_initialized = true; > > return 0; > @@ -442,10 +439,7 @@ int i915_switch_context(struct intel_ring_buffer *ring, > int to_id) > { > struct drm_i915_private *dev_priv = ring->dev->dev_private; > - struct drm_i915_file_private *file_priv = NULL; > struct i915_hw_context *to; > - struct drm_i915_gem_object *from_obj = ring->last_context_obj; > - int ret; > > if (dev_priv->hw_contexts_disabled) > return 0; > @@ -453,34 +447,18 @@ int i915_switch_context(struct intel_ring_buffer *ring, > if (ring != &dev_priv->ring[RCS]) > return 0; > > - if (file) > - file_priv = file->driver_priv; > - > if (to_id == DEFAULT_CONTEXT_ID) { > to = ring->default_context; > } else { > - to = i915_gem_context_get(file_priv, to_id); > + if (file == NULL) > + return -EINVAL; > + > + to = i915_gem_context_get(file->driver_priv, to_id); > if (to == NULL) > return -ENOENT; > } > > - if (from_obj == to->obj) > - return 0; > - > - ret = do_switch(from_obj, to, i915_gem_next_request_seqno(to->ring)); > - if (ret) > - return ret; > - > - /* Just to make the code a little cleaner we take the object reference > - * after the switch was successful. It would be more intuitive to ref > - * the 'to' object before the switch but we know the refcount must be >0 > - * if context_get() succeeded, and we hold struct mutex. So it's safe to > - * do this here/now > - */ > - drm_gem_object_reference(&to->obj->base); > - if (from_obj != NULL) > - drm_gem_object_unreference(&from_obj->base); > - return ret; > + return do_switch(to); > } > > int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, > -- > 1.7.10.4 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Fri, 13 Jul 2012 17:37:14 +0200, Daniel Vetter <daniel@ffwll.ch> wrote: > On Fri, Jul 13, 2012 at 02:14:05PM +0100, Chris Wilson wrote: > > Otherwise we end up trying to unpin a freed object and BUG. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Ben Widawsky <ben@bwidawsk.net> > > Afact this patch contains quite some code refactoring that does not > relate directly to the fix (or if it does, I fail to see the direct > relevance). So I think this either needs an explanation in the commit > message or be put into a separate patch (I agree though for actual code > cleanups). > > For the fix itself I seem to be a bit dense again - the only thing I see > is that you move the refcount handling into do_switch. Afacs we do the > ref-handling in both cases only when do_switch is successful, and also > right at the end of do_switch (or right afterwards). So can you please > enlighten your clueless maintainer a bit an explain how things blow up? The fix is that the reference handling was only done on one path, not both. Hence the default_ctx ends up being used-after-free. The rest of it was just unwinding the code to get to finding the bug... -Chris
On Sat, Jul 14, 2012 at 10:55:19AM +0100, Chris Wilson wrote: > On Fri, 13 Jul 2012 17:37:14 +0200, Daniel Vetter <daniel@ffwll.ch> wrote: > > On Fri, Jul 13, 2012 at 02:14:05PM +0100, Chris Wilson wrote: > > > Otherwise we end up trying to unpin a freed object and BUG. > > > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > > Cc: Ben Widawsky <ben@bwidawsk.net> > > > > Afact this patch contains quite some code refactoring that does not > > relate directly to the fix (or if it does, I fail to see the direct > > relevance). So I think this either needs an explanation in the commit > > message or be put into a separate patch (I agree though for actual code > > cleanups). > > > > For the fix itself I seem to be a bit dense again - the only thing I see > > is that you move the refcount handling into do_switch. Afacs we do the > > ref-handling in both cases only when do_switch is successful, and also > > right at the end of do_switch (or right afterwards). So can you please > > enlighten your clueless maintainer a bit an explain how things blow up? > > The fix is that the reference handling was only done on one path, not > both. Hence the default_ctx ends up being used-after-free. > > The rest of it was just unwinding the code to get to finding the bug... Yeah, I've figured this out somewhat later yesterday. Still, a bugfix shouldn't resemble a riddle more than a simple patch ... Afacs we only need to move the reference count handling from i915_switch_context to do_switch (and mention in the commit message that it's been missing when creating the default context). -Daniel
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 90857f8..b0a855f 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -97,8 +97,7 @@ static struct i915_hw_context * i915_gem_context_get(struct drm_i915_file_private *file_priv, u32 id); -static int do_switch(struct drm_i915_gem_object *from_obj, - struct i915_hw_context *to, u32 seqno); +static int do_switch(struct i915_hw_context *to); static int get_context_size(struct drm_device *dev) { @@ -220,26 +219,24 @@ static int create_default_context(struct drm_i915_private *dev_priv) */ dev_priv->ring[RCS].default_context = ctx; ret = i915_gem_object_pin(ctx->obj, CONTEXT_ALIGN, false); - if (ret) { - do_destroy(ctx); - return ret; - } + if (ret) + goto err_destroy; ret = i915_gem_object_set_to_gtt_domain(ctx->obj, true); - if (ret) { - i915_gem_object_unpin(ctx->obj); - do_destroy(ctx); - return ret; - } + if (ret) + goto err_unpin; - ret = do_switch(NULL, ctx, 0); - if (ret) { - i915_gem_object_unpin(ctx->obj); - do_destroy(ctx); - } else { - DRM_DEBUG_DRIVER("Default HW context loaded\n"); - } + ret = do_switch(ctx); + if (ret) + goto err_unpin; + DRM_DEBUG_DRIVER("Default HW context loaded\n"); + return 0; + +err_unpin: + i915_gem_object_unpin(ctx->obj); +err_destroy: + do_destroy(ctx); return ret; } @@ -366,16 +363,14 @@ mi_set_context(struct intel_ring_buffer *ring, return ret; } -static int do_switch(struct drm_i915_gem_object *from_obj, - struct i915_hw_context *to, - u32 seqno) +static int do_switch(struct i915_hw_context *to) { - struct intel_ring_buffer *ring = NULL; + struct drm_i915_gem_object *from_obj = to->ring->last_context_obj; u32 hw_flags = 0; int ret; - BUG_ON(to == NULL); - BUG_ON(from_obj != NULL && from_obj->pin_count == 0); + if (from_obj == to->obj) + return 0; ret = i915_gem_object_pin(to->obj, CONTEXT_ALIGN, false); if (ret) @@ -389,8 +384,7 @@ static int do_switch(struct drm_i915_gem_object *from_obj, else if (WARN_ON_ONCE(from_obj == to->obj)) /* not yet expected */ hw_flags |= MI_FORCE_RESTORE; - ring = to->ring; - ret = mi_set_context(ring, to, hw_flags); + ret = mi_set_context(to->ring, to, hw_flags); if (ret) { i915_gem_object_unpin(to->obj); return ret; @@ -403,6 +397,7 @@ static int do_switch(struct drm_i915_gem_object *from_obj, * MI_SET_CONTEXT instead of when the next seqno has completed. */ if (from_obj != NULL) { + u32 seqno = i915_gem_next_request_seqno(to->ring); /* As long as MI_SET_CONTEXT is serializing, ie. it flushes the * whole damn pipeline, we don't need to explicitly mark the * object dirty. The only exception is that the context must be @@ -412,13 +407,15 @@ static int do_switch(struct drm_i915_gem_object *from_obj, */ from_obj->base.write_domain = I915_GEM_DOMAIN_INSTRUCTION; from_obj->base.read_domains = I915_GEM_DOMAIN_INSTRUCTION; - i915_gem_object_move_to_active(from_obj, ring, seqno); + i915_gem_object_move_to_active(from_obj, to->ring, seqno); from_obj->dirty = 1; BUG_ON(from_obj->ring != to->ring); i915_gem_object_unpin(from_obj); + drm_gem_object_unreference(&from_obj->base); } - ring->last_context_obj = to->obj; + drm_gem_object_reference(&to->obj->base); + to->ring->last_context_obj = to->obj; to->is_initialized = true; return 0; @@ -442,10 +439,7 @@ int i915_switch_context(struct intel_ring_buffer *ring, int to_id) { struct drm_i915_private *dev_priv = ring->dev->dev_private; - struct drm_i915_file_private *file_priv = NULL; struct i915_hw_context *to; - struct drm_i915_gem_object *from_obj = ring->last_context_obj; - int ret; if (dev_priv->hw_contexts_disabled) return 0; @@ -453,34 +447,18 @@ int i915_switch_context(struct intel_ring_buffer *ring, if (ring != &dev_priv->ring[RCS]) return 0; - if (file) - file_priv = file->driver_priv; - if (to_id == DEFAULT_CONTEXT_ID) { to = ring->default_context; } else { - to = i915_gem_context_get(file_priv, to_id); + if (file == NULL) + return -EINVAL; + + to = i915_gem_context_get(file->driver_priv, to_id); if (to == NULL) return -ENOENT; } - if (from_obj == to->obj) - return 0; - - ret = do_switch(from_obj, to, i915_gem_next_request_seqno(to->ring)); - if (ret) - return ret; - - /* Just to make the code a little cleaner we take the object reference - * after the switch was successful. It would be more intuitive to ref - * the 'to' object before the switch but we know the refcount must be >0 - * if context_get() succeeded, and we hold struct mutex. So it's safe to - * do this here/now - */ - drm_gem_object_reference(&to->obj->base); - if (from_obj != NULL) - drm_gem_object_unreference(&from_obj->base); - return ret; + return do_switch(to); } int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
Otherwise we end up trying to unpin a freed object and BUG. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Ben Widawsky <ben@bwidawsk.net> --- drivers/gpu/drm/i915/i915_gem_context.c | 82 +++++++++++-------------------- 1 file changed, 30 insertions(+), 52 deletions(-)