diff mbox

drm/i915: Unwind partial VMA rebinding after failure in set-cache-level

Message ID 1444391498-14682-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Oct. 9, 2015, 11:51 a.m. UTC
If the impossible happens and we fail to rebind a VMA in the middle of
rebinding all VMA for an object we currently bail out and leave the
object in an inconsistent state. Attempt to unwind the incomplete update
by reverting all updated VMA back to the original cache-level, and WARN
if that fails.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

Comments

Tvrtko Ursulin Oct. 9, 2015, 12:11 p.m. UTC | #1
On 09/10/15 12:51, Chris Wilson wrote:
> If the impossible happens and we fail to rebind a VMA in the middle of
> rebinding all VMA for an object we currently bail out and leave the
> object in an inconsistent state. Attempt to unwind the incomplete update
> by reverting all updated VMA back to the original cache-level, and WARN
> if that fails.

Hey a BUG_ON would have been more your style! ;)

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 35 ++++++++++++++++++++++++++---------
>   1 file changed, 26 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1e67484fd5dc..24ba47a22260 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3664,6 +3664,12 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>   	struct i915_vma *vma, *next;
>   	int ret = 0;
>
> +	/* We manipulate all the PTEs in the various GTT associated with
> +	 * this object which requires that the caller takes the
> +	 * struct_mutex on our behalf.
> +	 */
> +	lockdep_assert_held(&dev->struct_mutex);
> +
>   	if (obj->cache_level == cache_level)
>   		goto out;
>
> @@ -3697,17 +3703,18 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>   				return ret;
>   		}
>
> -		list_for_each_entry(vma, &obj->vma_list, vma_link)
> -			if (drm_mm_node_allocated(&vma->node)) {
> -				ret = i915_vma_bind(vma, cache_level,
> -						    PIN_UPDATE);
> -				if (ret)
> -					return ret;
> -			}
> +		list_for_each_entry(vma, &obj->vma_list, vma_link) {
> +			if (!drm_mm_node_allocated(&vma->node))
> +				continue;
> +
> +			ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
> +			if (ret)
> +				goto unwind;
> +
> +			vma->node.color = cache_level;
> +		}
>   	}
>
> -	list_for_each_entry(vma, &obj->vma_list, vma_link)
> -		vma->node.color = cache_level;
>   	obj->cache_level = cache_level;
>
>   out:
> @@ -3719,6 +3726,16 @@ out:
>   	}
>
>   	return 0;
> +
> +unwind:
> +	list_for_each_entry_continue_reverse(vma, &obj->vma_list, vma_link) {

I did not know of this one! But it confuses me (emphasis mine):

"""
list_for_each_entry_continue_reverse - iterate backwards _from the given 
point_
"""

or

"""
  * Start to iterate over list of given type backwards, continuing _after
  * the current position_.
"""

Code is "for (pos = list_prev_entry(pos, member); " though, so I think 
you'll miss rebinding the one it failed on.

Regards,

Tvrtko
Chris Wilson Oct. 9, 2015, 12:18 p.m. UTC | #2
On Fri, Oct 09, 2015 at 01:11:47PM +0100, Tvrtko Ursulin wrote:
> 
> On 09/10/15 12:51, Chris Wilson wrote:
> >If the impossible happens and we fail to rebind a VMA in the middle of
> >rebinding all VMA for an object we currently bail out and leave the
> >object in an inconsistent state. Attempt to unwind the incomplete update
> >by reverting all updated VMA back to the original cache-level, and WARN
> >if that fails.
> 
> Hey a BUG_ON would have been more your style! ;)
> 
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_gem.c | 35 ++++++++++++++++++++++++++---------
> >  1 file changed, 26 insertions(+), 9 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >index 1e67484fd5dc..24ba47a22260 100644
> >--- a/drivers/gpu/drm/i915/i915_gem.c
> >+++ b/drivers/gpu/drm/i915/i915_gem.c
> >@@ -3664,6 +3664,12 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> >  	struct i915_vma *vma, *next;
> >  	int ret = 0;
> >
> >+	/* We manipulate all the PTEs in the various GTT associated with
> >+	 * this object which requires that the caller takes the
> >+	 * struct_mutex on our behalf.
> >+	 */
> >+	lockdep_assert_held(&dev->struct_mutex);
> >+
> >  	if (obj->cache_level == cache_level)
> >  		goto out;
> >
> >@@ -3697,17 +3703,18 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> >  				return ret;
> >  		}
> >
> >-		list_for_each_entry(vma, &obj->vma_list, vma_link)
> >-			if (drm_mm_node_allocated(&vma->node)) {
> >-				ret = i915_vma_bind(vma, cache_level,
> >-						    PIN_UPDATE);
> >-				if (ret)
> >-					return ret;
> >-			}
> >+		list_for_each_entry(vma, &obj->vma_list, vma_link) {
> >+			if (!drm_mm_node_allocated(&vma->node))
> >+				continue;
> >+
> >+			ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
> >+			if (ret)
> >+				goto unwind;
> >+
> >+			vma->node.color = cache_level;
> >+		}
> >  	}
> >
> >-	list_for_each_entry(vma, &obj->vma_list, vma_link)
> >-		vma->node.color = cache_level;
> >  	obj->cache_level = cache_level;
> >
> >  out:
> >@@ -3719,6 +3726,16 @@ out:
> >  	}
> >
> >  	return 0;
> >+
> >+unwind:
> >+	list_for_each_entry_continue_reverse(vma, &obj->vma_list, vma_link) {
> 
> I did not know of this one! But it confuses me (emphasis mine):
> 
> """
> list_for_each_entry_continue_reverse - iterate backwards _from the
> given point_
> """
> 
> or
> 
> """
>  * Start to iterate over list of given type backwards, continuing _after
>  * the current position_.
> """
> 
> Code is "for (pos = list_prev_entry(pos, member); " though, so I
> think you'll miss rebinding the one it failed on.

Correct, but we didn't change the one we failed upon. If i915_vma_bind()
itself doesn't unwind, we have another bug to fix! :)
-Chris
Chris Wilson Oct. 9, 2015, 12:19 p.m. UTC | #3
On Fri, Oct 09, 2015 at 01:11:47PM +0100, Tvrtko Ursulin wrote:
> 
> On 09/10/15 12:51, Chris Wilson wrote:
> >If the impossible happens and we fail to rebind a VMA in the middle of
> >rebinding all VMA for an object we currently bail out and leave the
> >object in an inconsistent state. Attempt to unwind the incomplete update
> >by reverting all updated VMA back to the original cache-level, and WARN
> >if that fails.
> 
> Hey a BUG_ON would have been more your style! ;)

Propagating error returns trumps throwing a tantrum and then as we are
on the the error path we've already proved the impossible could happen,
so presume it could happen again.
-Chris
Tvrtko Ursulin Oct. 9, 2015, 12:37 p.m. UTC | #4
On 09/10/15 13:19, Chris Wilson wrote:
> On Fri, Oct 09, 2015 at 01:11:47PM +0100, Tvrtko Ursulin wrote:
>>
>> On 09/10/15 12:51, Chris Wilson wrote:
>>> If the impossible happens and we fail to rebind a VMA in the middle of
>>> rebinding all VMA for an object we currently bail out and leave the
>>> object in an inconsistent state. Attempt to unwind the incomplete update
>>> by reverting all updated VMA back to the original cache-level, and WARN
>>> if that fails.
>>
>> Hey a BUG_ON would have been more your style! ;)
>
> Propagating error returns trumps throwing a tantrum and then as we are
> on the the error path we've already proved the impossible could happen,
> so presume it could happen again.

No no, just a BUG_ON(i915_bind_vma()) on the original call site would 
suffice since it cannot happen anyway. ;>

For the same reason it is safe to skip the current entry when reversing 
I suppose.

But it is still code to handle something which can't happen. If you want 
to stuff it under future proofing you can have my r-b, but hope someone 
who doesn't like future proofing notices. :)

Regards,

Tvrtko
Chris Wilson Oct. 9, 2015, 1:10 p.m. UTC | #5
On Fri, Oct 09, 2015 at 01:37:19PM +0100, Tvrtko Ursulin wrote:
> 
> On 09/10/15 13:19, Chris Wilson wrote:
> >On Fri, Oct 09, 2015 at 01:11:47PM +0100, Tvrtko Ursulin wrote:
> >>
> >>On 09/10/15 12:51, Chris Wilson wrote:
> >>>If the impossible happens and we fail to rebind a VMA in the middle of
> >>>rebinding all VMA for an object we currently bail out and leave the
> >>>object in an inconsistent state. Attempt to unwind the incomplete update
> >>>by reverting all updated VMA back to the original cache-level, and WARN
> >>>if that fails.
> >>
> >>Hey a BUG_ON would have been more your style! ;)
> >
> >Propagating error returns trumps throwing a tantrum and then as we are
> >on the the error path we've already proved the impossible could happen,
> >so presume it could happen again.
> 
> No no, just a BUG_ON(i915_bind_vma()) on the original call site
> would suffice since it cannot happen anyway. ;>
> 
> For the same reason it is safe to skip the current entry when
> reversing I suppose.
> 
> But it is still code to handle something which can't happen. If you
> want to stuff it under future proofing you can have my r-b, but hope
> someone who doesn't like future proofing notices. :)

Maybe I should say nothing is impossible with i915_gem_gtt.c :)
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1e67484fd5dc..24ba47a22260 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3664,6 +3664,12 @@  int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	struct i915_vma *vma, *next;
 	int ret = 0;
 
+	/* We manipulate all the PTEs in the various GTT associated with
+	 * this object which requires that the caller takes the
+	 * struct_mutex on our behalf.
+	 */
+	lockdep_assert_held(&dev->struct_mutex);
+
 	if (obj->cache_level == cache_level)
 		goto out;
 
@@ -3697,17 +3703,18 @@  int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 				return ret;
 		}
 
-		list_for_each_entry(vma, &obj->vma_list, vma_link)
-			if (drm_mm_node_allocated(&vma->node)) {
-				ret = i915_vma_bind(vma, cache_level,
-						    PIN_UPDATE);
-				if (ret)
-					return ret;
-			}
+		list_for_each_entry(vma, &obj->vma_list, vma_link) {
+			if (!drm_mm_node_allocated(&vma->node))
+				continue;
+
+			ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
+			if (ret)
+				goto unwind;
+
+			vma->node.color = cache_level;
+		}
 	}
 
-	list_for_each_entry(vma, &obj->vma_list, vma_link)
-		vma->node.color = cache_level;
 	obj->cache_level = cache_level;
 
 out:
@@ -3719,6 +3726,16 @@  out:
 	}
 
 	return 0;
+
+unwind:
+	list_for_each_entry_continue_reverse(vma, &obj->vma_list, vma_link) {
+		if (!drm_mm_node_allocated(&vma->node))
+			continue;
+
+		WARN_ON(i915_vma_bind(vma, obj->cache_level, PIN_UPDATE));
+		vma->node.color = obj->cache_level;
+	}
+	return ret;
 }
 
 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,