diff mbox series

drm/i915/ttm: Fix incorrect assumptions about ttm_bo_validate() semantics

Message ID 20210618083117.158081-1-thomas.hellstrom@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/ttm: Fix incorrect assumptions about ttm_bo_validate() semantics | expand

Commit Message

Thomas Hellstrom June 18, 2021, 8:31 a.m. UTC
We have assumed that if the current placement was not the requested
placement, but instead one of the busy placements, a TTM move would have
been triggered. That is not the case.

So when we initially place LMEM objects in "Limbo", (that is system
placement without any pages allocated), to be able to defer clearing
objects until first get_pages(), the first get_pages() would happily keep
objects in system memory if that is one of the allowed placements. And
since we don't yet support i915 GEM system memory from TTM, everything
breaks apart.

So make sure we try the requested placement first, if no eviction is
needed. If that fails, retry with all allowed placements also allowing
evictions. Also make sure we handle TTM failure codes correctly.

Also temporarily (until we support i915 GEM system on TTM), restrict
allowed placements to the requested placement to avoid things falling
apart should LMEM be full.

Fixes: 38f28c0695c0 ("drm/i915/ttm: Calculate the object placement at get_pages time)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 61 +++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 3 deletions(-)

Comments

Matthew Auld June 18, 2021, 10:53 a.m. UTC | #1
On Fri, 18 Jun 2021 at 09:31, Thomas Hellström
<thomas.hellstrom@linux.intel.com> wrote:
>
> We have assumed that if the current placement was not the requested
> placement, but instead one of the busy placements, a TTM move would have
> been triggered. That is not the case.
>
> So when we initially place LMEM objects in "Limbo", (that is system
> placement without any pages allocated), to be able to defer clearing
> objects until first get_pages(), the first get_pages() would happily keep
> objects in system memory if that is one of the allowed placements. And
> since we don't yet support i915 GEM system memory from TTM, everything
> breaks apart.
>
> So make sure we try the requested placement first, if no eviction is
> needed. If that fails, retry with all allowed placements also allowing
> evictions. Also make sure we handle TTM failure codes correctly.
>
> Also temporarily (until we support i915 GEM system on TTM), restrict
> allowed placements to the requested placement to avoid things falling
> apart should LMEM be full.
>
> Fixes: 38f28c0695c0 ("drm/i915/ttm: Calculate the object placement at get_pages time)
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 61 +++++++++++++++++++++++--
>  1 file changed, 58 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index df46535cca47..4bb0440f693c 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -64,6 +64,30 @@ static struct ttm_placement i915_sys_placement = {
>         .busy_placement = &sys_placement_flags,
>  };
>
> +static int i915_ttm_err_to_gem(int err)
> +{
> +       /* Fastpath */
> +       if (likely(!err))
> +               return 0;
> +
> +       switch (err) {
> +       case -EBUSY:
> +               /*
> +                * TTM likes to convert -EDEADLK to -EBUSY, and wants us to
> +                * restart the operation, since we don't record the contending
> +                * lock. We use -EAGAIN to restart.
> +                */
> +               return -EAGAIN;
> +       case -ENOSPC:
> +               /* Memory type / region is full, and we can't evict. */
> +               return -ENXIO;

ttm system will return -ENOMEM right?

Reviewed-by: Matthew Auld <matthew.auld@intel.com>

> +       default:
> +               break;
> +       }
> +
> +       return err;
> +}
> +
>  static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
>
>  static enum ttm_caching
> @@ -522,15 +546,46 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>         struct sg_table *st;
>         struct ttm_place requested, busy[I915_TTM_MAX_PLACEMENTS];
>         struct ttm_placement placement;
> +       int real_num_busy;
>         int ret;
>
>         GEM_BUG_ON(obj->mm.n_placements > I915_TTM_MAX_PLACEMENTS);
>
>         /* Move to the requested placement. */
>         i915_ttm_placement_from_obj(obj, &requested, busy, &placement);
> +
> +       /*
> +        * For now we support LMEM only with TTM.
> +        * TODO: Remove with system support
> +        */
> +       GEM_BUG_ON(requested.mem_type < I915_PL_LMEM0 ||
> +                  busy[0].mem_type < I915_PL_LMEM0);
> +
> +       /* First try only the requested placement. No eviction. */
> +       real_num_busy = fetch_and_zero(&placement.num_busy_placement);
>         ret = ttm_bo_validate(bo, &placement, &ctx);
> -       if (ret)
> -               return ret == -ENOSPC ? -ENXIO : ret;
> +       if (ret) {
> +               ret = i915_ttm_err_to_gem(ret);
> +               /*
> +                * Anything that wants to restart the operation gets to
> +                * do that.
> +                */
> +               if (ret == -EDEADLK || ret == -EINTR || ret == -ERESTARTSYS ||
> +                   ret == -EAGAIN)
> +                       return ret;
> +
> +               /* TODO: Remove this when we support system as TTM. */
> +               real_num_busy = 1;
> +
> +               /*
> +                * If the initial attempt fails, allow all accepted placements,
> +                * evicting if necessary.
> +                */
> +               placement.num_busy_placement = real_num_busy;
> +               ret = ttm_bo_validate(bo, &placement, &ctx);
> +               if (ret)
> +                       return i915_ttm_err_to_gem(ret);
> +       }
>
>         /* Object either has a page vector or is an iomem object */
>         st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
> @@ -741,5 +796,5 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>                 obj->ttm.created = true;
>
>         /* i915 wants -ENXIO when out of memory region space. */
> -       return (ret == -ENOSPC) ? -ENXIO : ret;
> +       return i915_ttm_err_to_gem(ret);
>  }
> --
> 2.31.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Thomas Hellstrom June 18, 2021, 12:07 p.m. UTC | #2
On 6/18/21 12:26 PM, Patchwork wrote:
> Project List - Patchwork *Patch Details*
> *Series:* 	drm/i915/ttm: Fix incorrect assumptions about 
> ttm_bo_validate() semantics
> *URL:* 	https://patchwork.freedesktop.org/series/91661/ 
> <https://patchwork.freedesktop.org/series/91661/>
> *State:* 	failure
> *Details:* 
> https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_20407/index.html 
> <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_20407/index.html>
>
>
>   CI Bug Log - changes from CI_DRM_10242_full -> Patchwork_20407_full
>
>
>     Summary
>
> *FAILURE*
>
> Serious unknown changes coming with Patchwork_20407_full absolutely 
> need to be
> verified manually.
>
> If you think the reported changes have nothing to do with the changes
> introduced in Patchwork_20407_full, please notify your bug team to 
> allow them
> to document this new failure mode, which will reduce false positives 
> in CI.
>
>
>     Possible new issues
>
> Here are the unknown changes that may have been introduced in 
> Patchwork_20407_full:
>
>
>       IGT changes
>
>
>         Possible regressions
>
>  *
>
>     igt@gem_exec_reloc@basic-wide-active@vcs1:
>
>       o shard-iclb: NOTRUN -> FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_20407/shard-iclb2/igt@gem_exec_reloc@basic-wide-active@vcs1.html>
>  *
>
>     igt@kms_big_fb@x-tiled-8bpp-rotate-0:
>
>       o shard-glk: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_10242/shard-glk3/igt@kms_big_fb@x-tiled-8bpp-rotate-0.html>
>         -> FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_20407/shard-glk1/igt@kms_big_fb@x-tiled-8bpp-rotate-0.html>
>
Failures not related AFAICT.
Thomas Hellstrom June 18, 2021, 12:41 p.m. UTC | #3
On 6/18/21 12:53 PM, Matthew Auld wrote:
> On Fri, 18 Jun 2021 at 09:31, Thomas Hellström
> <thomas.hellstrom@linux.intel.com> wrote:
>> We have assumed that if the current placement was not the requested
>> placement, but instead one of the busy placements, a TTM move would have
>> been triggered. That is not the case.
>>
>> So when we initially place LMEM objects in "Limbo", (that is system
>> placement without any pages allocated), to be able to defer clearing
>> objects until first get_pages(), the first get_pages() would happily keep
>> objects in system memory if that is one of the allowed placements. And
>> since we don't yet support i915 GEM system memory from TTM, everything
>> breaks apart.
>>
>> So make sure we try the requested placement first, if no eviction is
>> needed. If that fails, retry with all allowed placements also allowing
>> evictions. Also make sure we handle TTM failure codes correctly.
>>
>> Also temporarily (until we support i915 GEM system on TTM), restrict
>> allowed placements to the requested placement to avoid things falling
>> apart should LMEM be full.
>>
>> Fixes: 38f28c0695c0 ("drm/i915/ttm: Calculate the object placement at get_pages time)
>> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 61 +++++++++++++++++++++++--
>>   1 file changed, 58 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> index df46535cca47..4bb0440f693c 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> @@ -64,6 +64,30 @@ static struct ttm_placement i915_sys_placement = {
>>          .busy_placement = &sys_placement_flags,
>>   };
>>
>> +static int i915_ttm_err_to_gem(int err)
>> +{
>> +       /* Fastpath */
>> +       if (likely(!err))
>> +               return 0;
>> +
>> +       switch (err) {
>> +       case -EBUSY:
>> +               /*
>> +                * TTM likes to convert -EDEADLK to -EBUSY, and wants us to
>> +                * restart the operation, since we don't record the contending
>> +                * lock. We use -EAGAIN to restart.
>> +                */
>> +               return -EAGAIN;
>> +       case -ENOSPC:
>> +               /* Memory type / region is full, and we can't evict. */
>> +               return -ENXIO;
> ttm system will return -ENOMEM right?
>
> Reviewed-by: Matthew Auld <matthew.auld@intel.com>

Hmm, Yes, I suppose so. Will that need some mangling before handing over 
to GEM?

Thanks for reviewing!

Thomas
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index df46535cca47..4bb0440f693c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -64,6 +64,30 @@  static struct ttm_placement i915_sys_placement = {
 	.busy_placement = &sys_placement_flags,
 };
 
+static int i915_ttm_err_to_gem(int err)
+{
+	/* Fastpath */
+	if (likely(!err))
+		return 0;
+
+	switch (err) {
+	case -EBUSY:
+		/*
+		 * TTM likes to convert -EDEADLK to -EBUSY, and wants us to
+		 * restart the operation, since we don't record the contending
+		 * lock. We use -EAGAIN to restart.
+		 */
+		return -EAGAIN;
+	case -ENOSPC:
+		/* Memory type / region is full, and we can't evict. */
+		return -ENXIO;
+	default:
+		break;
+	}
+
+	return err;
+}
+
 static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
 
 static enum ttm_caching
@@ -522,15 +546,46 @@  static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
 	struct sg_table *st;
 	struct ttm_place requested, busy[I915_TTM_MAX_PLACEMENTS];
 	struct ttm_placement placement;
+	int real_num_busy;
 	int ret;
 
 	GEM_BUG_ON(obj->mm.n_placements > I915_TTM_MAX_PLACEMENTS);
 
 	/* Move to the requested placement. */
 	i915_ttm_placement_from_obj(obj, &requested, busy, &placement);
+
+	/*
+	 * For now we support LMEM only with TTM.
+	 * TODO: Remove with system support
+	 */
+	GEM_BUG_ON(requested.mem_type < I915_PL_LMEM0 ||
+		   busy[0].mem_type < I915_PL_LMEM0);
+
+	/* First try only the requested placement. No eviction. */
+	real_num_busy = fetch_and_zero(&placement.num_busy_placement);
 	ret = ttm_bo_validate(bo, &placement, &ctx);
-	if (ret)
-		return ret == -ENOSPC ? -ENXIO : ret;
+	if (ret) {
+		ret = i915_ttm_err_to_gem(ret);
+		/*
+		 * Anything that wants to restart the operation gets to
+		 * do that.
+		 */
+		if (ret == -EDEADLK || ret == -EINTR || ret == -ERESTARTSYS ||
+		    ret == -EAGAIN)
+			return ret;
+
+		/* TODO: Remove this when we support system as TTM. */
+		real_num_busy = 1;
+
+		/*
+		 * If the initial attempt fails, allow all accepted placements,
+		 * evicting if necessary.
+		 */
+		placement.num_busy_placement = real_num_busy;
+		ret = ttm_bo_validate(bo, &placement, &ctx);
+		if (ret)
+			return i915_ttm_err_to_gem(ret);
+	}
 
 	/* Object either has a page vector or is an iomem object */
 	st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
@@ -741,5 +796,5 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 		obj->ttm.created = true;
 
 	/* i915 wants -ENXIO when out of memory region space. */
-	return (ret == -ENOSPC) ? -ENXIO : ret;
+	return i915_ttm_err_to_gem(ret);
 }