diff mbox series

[v7,2/3] drm/i915/ttm: Adjust gem flags and caching settings after a move

Message ID 20210622093418.153400-3-thomas.hellstrom@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Move system memory to TTM for discrete | expand

Commit Message

Thomas Hellström June 22, 2021, 9:34 a.m. UTC
After a TTM move or object init we need to update the i915 gem flags and
caching settings to reflect the new placement. Currently caching settings
are not changed during the lifetime of an object, although that might
change moving forward if we run into performance issues or issues with
WC system page allocations.
Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
various ways we previously used to detect this.
Finally, initialize the TTM object reserved to be able to update
flags and caching before anyone else gets hold of the object.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
---
v6:
- Rebase on accelerated ttm moves.
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 143 ++++++++++++++++++------
 1 file changed, 107 insertions(+), 36 deletions(-)

Comments

Matthew Auld June 22, 2021, 9:44 a.m. UTC | #1
On Tue, 22 Jun 2021 at 10:34, Thomas Hellström
<thomas.hellstrom@linux.intel.com> wrote:
>
> After a TTM move or object init we need to update the i915 gem flags and
> caching settings to reflect the new placement. Currently caching settings
> are not changed during the lifetime of an object, although that might
> change moving forward if we run into performance issues or issues with
> WC system page allocations.
> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
> various ways we previously used to detect this.
> Finally, initialize the TTM object reserved to be able to update
> flags and caching before anyone else gets hold of the object.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> ---
> v6:
> - Rebase on accelerated ttm moves.
> ---

<snip>

> @@ -775,14 +845,13 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>         i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
>         i915_gem_object_init_memory_region(obj, mem);
>         i915_gem_object_make_unshrinkable(obj);
> -       obj->read_domains = I915_GEM_DOMAIN_WC | I915_GEM_DOMAIN_GTT;
> -       obj->mem_flags |= I915_BO_FLAG_IOMEM;
> -       i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
>         INIT_RADIX_TREE(&obj->ttm.get_io_page.radix, GFP_KERNEL | __GFP_NOWARN);
>         mutex_init(&obj->ttm.get_io_page.lock);
>         bo_type = (obj->flags & I915_BO_ALLOC_USER) ? ttm_bo_type_device :
>                 ttm_bo_type_kernel;
>
> +       obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
> +
>         /*
>          * If this function fails, it will call the destructor, but
>          * our caller still owns the object. So no freeing in the
> @@ -790,14 +859,16 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>          * Similarly, in delayed_destroy, we can't call ttm_bo_put()
>          * until successful initialization.
>          */
> -       obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
> -       ret = ttm_bo_init(&i915->bdev, i915_gem_to_ttm(obj), size,
> -                         bo_type, &i915_sys_placement,
> -                         mem->min_page_size >> PAGE_SHIFT,
> -                         true, NULL, NULL, i915_ttm_bo_destroy);
> -       if (!ret)
> -               obj->ttm.created = true;
> -
> -       /* i915 wants -ENXIO when out of memory region space. */
> -       return i915_ttm_err_to_gem(ret);
> +       ret = ttm_bo_init_reserved(&i915->bdev, i915_gem_to_ttm(obj), size,
> +                                  bo_type, &i915_sys_placement, 1,

mem->min_page_size >> PAGE_SHIFT? Although just realised that looks
iffy since it only considers the current region, when it should
consider all future placements. I wonder if it makes sense to make
page_alignment part of ttm_place? Anyway, it doesn't matter for this
series.

> +                                  &ctx, NULL, NULL, i915_ttm_bo_destroy);
> +       if (ret)
> +               return i915_ttm_err_to_gem(ret);
> +
> +       obj->ttm.created = true;
> +       i915_ttm_adjust_domains_after_move(obj);
> +       i915_ttm_adjust_gem_after_move(obj);
> +       i915_gem_object_unlock(obj);
> +
> +       return 0;
>  }
> --
> 2.31.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Thomas Hellström June 22, 2021, 10:07 a.m. UTC | #2
On 6/22/21 11:44 AM, Matthew Auld wrote:
> On Tue, 22 Jun 2021 at 10:34, Thomas Hellström
> <thomas.hellstrom@linux.intel.com> wrote:
>> After a TTM move or object init we need to update the i915 gem flags and
>> caching settings to reflect the new placement. Currently caching settings
>> are not changed during the lifetime of an object, although that might
>> change moving forward if we run into performance issues or issues with
>> WC system page allocations.
>> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
>> various ways we previously used to detect this.
>> Finally, initialize the TTM object reserved to be able to update
>> flags and caching before anyone else gets hold of the object.
>>
>> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>> ---
>> v6:
>> - Rebase on accelerated ttm moves.
>> ---
> <snip>
>
>> @@ -775,14 +845,13 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>>          i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
>>          i915_gem_object_init_memory_region(obj, mem);
>>          i915_gem_object_make_unshrinkable(obj);
>> -       obj->read_domains = I915_GEM_DOMAIN_WC | I915_GEM_DOMAIN_GTT;
>> -       obj->mem_flags |= I915_BO_FLAG_IOMEM;
>> -       i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
>>          INIT_RADIX_TREE(&obj->ttm.get_io_page.radix, GFP_KERNEL | __GFP_NOWARN);
>>          mutex_init(&obj->ttm.get_io_page.lock);
>>          bo_type = (obj->flags & I915_BO_ALLOC_USER) ? ttm_bo_type_device :
>>                  ttm_bo_type_kernel;
>>
>> +       obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
>> +
>>          /*
>>           * If this function fails, it will call the destructor, but
>>           * our caller still owns the object. So no freeing in the
>> @@ -790,14 +859,16 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>>           * Similarly, in delayed_destroy, we can't call ttm_bo_put()
>>           * until successful initialization.
>>           */
>> -       obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
>> -       ret = ttm_bo_init(&i915->bdev, i915_gem_to_ttm(obj), size,
>> -                         bo_type, &i915_sys_placement,
>> -                         mem->min_page_size >> PAGE_SHIFT,
>> -                         true, NULL, NULL, i915_ttm_bo_destroy);
>> -       if (!ret)
>> -               obj->ttm.created = true;
>> -
>> -       /* i915 wants -ENXIO when out of memory region space. */
>> -       return i915_ttm_err_to_gem(ret);
>> +       ret = ttm_bo_init_reserved(&i915->bdev, i915_gem_to_ttm(obj), size,
>> +                                  bo_type, &i915_sys_placement, 1,
> mem->min_page_size >> PAGE_SHIFT? Although just realised that looks
> iffy since it only considers the current region, when it should
> consider all future placements. I wonder if it makes sense to make
> page_alignment part of ttm_place? Anyway, it doesn't matter for this
> series.

Good catch. Yes completely agree it should be part of ttm_place. But 
extending ttm_place and audit all drivers to always clear unused parts 
of ttm_place is a big task.

But it's also the case that the region manager is allowed to enforce an 
alignment, unknown to us here, so we might want to take that approach to 
begin with?

/Thomas
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index b5dd3b7037f4..966b292d07da 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -91,6 +91,26 @@  static int i915_ttm_err_to_gem(int err)
 	return err;
 }
 
+static bool gpu_binds_iomem(struct ttm_resource *mem)
+{
+	return mem->mem_type != TTM_PL_SYSTEM;
+}
+
+static bool cpu_maps_iomem(struct ttm_resource *mem)
+{
+	/* Once / if we support GGTT, this is also false for cached ttm_tts */
+	return mem->mem_type != TTM_PL_SYSTEM;
+}
+
+static enum i915_cache_level
+i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res,
+		     struct ttm_tt *ttm)
+{
+	return ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(res) &&
+		ttm->caching == ttm_cached) ? I915_CACHE_LLC :
+		I915_CACHE_NONE;
+}
+
 static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
 
 static enum ttm_caching
@@ -248,6 +268,35 @@  static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
 	obj->ttm.cached_io_st = NULL;
 }
 
+static void
+i915_ttm_adjust_domains_after_move(struct drm_i915_gem_object *obj)
+{
+	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
+
+	if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
+		obj->write_domain = I915_GEM_DOMAIN_WC;
+		obj->read_domains = I915_GEM_DOMAIN_WC;
+	} else {
+		obj->write_domain = I915_GEM_DOMAIN_CPU;
+		obj->read_domains = I915_GEM_DOMAIN_CPU;
+	}
+}
+
+static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
+{
+	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
+	unsigned int cache_level;
+
+	obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
+
+	obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
+		I915_BO_FLAG_STRUCT_PAGE;
+
+	cache_level = i915_ttm_cache_level(to_i915(bo->base.dev), bo->resource,
+					   bo->ttm);
+	i915_gem_object_set_cache_coherency(obj, cache_level);
+}
+
 static void i915_ttm_purge(struct drm_i915_gem_object *obj)
 {
 	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
@@ -263,8 +312,10 @@  static void i915_ttm_purge(struct drm_i915_gem_object *obj)
 
 	/* TTM's purge interface. Note that we might be reentering. */
 	ret = ttm_bo_validate(bo, &place, &ctx);
-
 	if (!ret) {
+		obj->write_domain = 0;
+		obj->read_domains = 0;
+		i915_ttm_adjust_gem_after_move(obj);
 		i915_ttm_free_cached_io_st(obj);
 		obj->mm.madv = __I915_MADV_PURGED;
 	}
@@ -347,12 +398,15 @@  i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
 			 struct ttm_resource *res)
 {
 	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
-	struct ttm_resource_manager *man =
-		ttm_manager_type(bo->bdev, res->mem_type);
 
-	if (man->use_tt)
+	if (!gpu_binds_iomem(res))
 		return i915_ttm_tt_get_st(bo->ttm);
 
+	/*
+	 * If CPU mapping differs, we need to add the ttm_tt pages to
+	 * the resulting st. Might make sense for GGTT.
+	 */
+	GEM_WARN_ON(!cpu_maps_iomem(res));
 	return intel_region_ttm_resource_to_st(obj->mm.region, res);
 }
 
@@ -367,23 +421,25 @@  static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
 	struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
 	struct sg_table *src_st;
 	struct i915_request *rq;
+	struct ttm_tt *ttm = bo->ttm;
+	enum i915_cache_level src_level, dst_level;
 	int ret;
 
 	if (!i915->gt.migrate.context)
 		return -EINVAL;
 
-	if (!bo->ttm || !ttm_tt_is_populated(bo->ttm)) {
+	dst_level = i915_ttm_cache_level(i915, dst_mem, ttm);
+	if (!ttm || !ttm_tt_is_populated(ttm)) {
 		if (bo->type == ttm_bo_type_kernel)
 			return -EINVAL;
 
-		if (bo->ttm &&
-		    !(bo->ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC))
+		if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC))
 			return 0;
 
 		intel_engine_pm_get(i915->gt.migrate.context->engine);
 		ret = intel_context_migrate_clear(i915->gt.migrate.context, NULL,
-						  dst_st->sgl, I915_CACHE_NONE,
-						  dst_mem->mem_type >= I915_PL_LMEM0,
+						  dst_st->sgl, dst_level,
+						  gpu_binds_iomem(dst_mem),
 						  0, &rq);
 
 		if (!ret && rq) {
@@ -392,15 +448,16 @@  static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
 		}
 		intel_engine_pm_put(i915->gt.migrate.context->engine);
 	} else {
-		src_st = src_man->use_tt ? i915_ttm_tt_get_st(bo->ttm) :
-						obj->ttm.cached_io_st;
+		src_st = src_man->use_tt ? i915_ttm_tt_get_st(ttm) :
+			obj->ttm.cached_io_st;
 
+		src_level = i915_ttm_cache_level(i915, bo->resource, ttm);
 		intel_engine_pm_get(i915->gt.migrate.context->engine);
 		ret = intel_context_migrate_copy(i915->gt.migrate.context,
-						 NULL, src_st->sgl, I915_CACHE_NONE,
-						 bo->resource->mem_type >= I915_PL_LMEM0,
-						 dst_st->sgl, I915_CACHE_NONE,
-						 dst_mem->mem_type >= I915_PL_LMEM0,
+						 NULL, src_st->sgl, src_level,
+						 gpu_binds_iomem(bo->resource),
+						 dst_st->sgl, dst_level,
+						 gpu_binds_iomem(dst_mem),
 						 &rq);
 		if (!ret && rq) {
 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
@@ -420,8 +477,6 @@  static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
 	struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
 	struct ttm_resource_manager *dst_man =
 		ttm_manager_type(bo->bdev, dst_mem->mem_type);
-	struct ttm_resource_manager *src_man =
-		ttm_manager_type(bo->bdev, bo->resource->mem_type);
 	struct intel_memory_region *dst_reg, *src_reg;
 	union {
 		struct ttm_kmap_iter_tt tt;
@@ -465,12 +520,12 @@  static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
 	ret = i915_ttm_accel_move(bo, dst_mem, dst_st);
 	if (ret) {
 		/* If we start mapping GGTT, we can no longer use man::use_tt here. */
-		dst_iter = dst_man->use_tt ?
+		dst_iter = !cpu_maps_iomem(dst_mem) ?
 			ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
 			ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
 						 dst_st, dst_reg->region.start);
 
-		src_iter = src_man->use_tt ?
+		src_iter = !cpu_maps_iomem(bo->resource) ?
 			ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
 			ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
 						 obj->ttm.cached_io_st,
@@ -478,21 +533,24 @@  static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
 
 		ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
 	}
+	/* Below dst_mem becomes bo->resource. */
 	ttm_bo_move_sync_cleanup(bo, dst_mem);
+	i915_ttm_adjust_domains_after_move(obj);
 	i915_ttm_free_cached_io_st(obj);
 
-	if (!dst_man->use_tt) {
+	if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
 		obj->ttm.cached_io_st = dst_st;
 		obj->ttm.get_io_page.sg_pos = dst_st->sgl;
 		obj->ttm.get_io_page.sg_idx = 0;
 	}
 
+	i915_ttm_adjust_gem_after_move(obj);
 	return 0;
 }
 
 static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
 {
-	if (mem->mem_type < I915_PL_LMEM0)
+	if (!cpu_maps_iomem(mem))
 		return 0;
 
 	mem->bus.caching = ttm_write_combined;
@@ -590,6 +648,16 @@  static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
 			return i915_ttm_err_to_gem(ret);
 	}
 
+	i915_ttm_adjust_lru(obj);
+	if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
+		ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
+		if (ret)
+			return ret;
+
+		i915_ttm_adjust_domains_after_move(obj);
+		i915_ttm_adjust_gem_after_move(obj);
+	}
+
 	/* Object either has a page vector or is an iomem object */
 	st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
 	if (IS_ERR(st))
@@ -597,8 +665,6 @@  static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
 
 	__i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
 
-	i915_ttm_adjust_lru(obj);
-
 	return ret;
 }
 
@@ -768,6 +834,10 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 {
 	static struct lock_class_key lock_class;
 	struct drm_i915_private *i915 = mem->i915;
+	struct ttm_operation_ctx ctx = {
+		.interruptible = true,
+		.no_wait_gpu = false,
+	};
 	enum ttm_bo_type bo_type;
 	int ret;
 
@@ -775,14 +845,13 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 	i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
 	i915_gem_object_init_memory_region(obj, mem);
 	i915_gem_object_make_unshrinkable(obj);
-	obj->read_domains = I915_GEM_DOMAIN_WC | I915_GEM_DOMAIN_GTT;
-	obj->mem_flags |= I915_BO_FLAG_IOMEM;
-	i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
 	INIT_RADIX_TREE(&obj->ttm.get_io_page.radix, GFP_KERNEL | __GFP_NOWARN);
 	mutex_init(&obj->ttm.get_io_page.lock);
 	bo_type = (obj->flags & I915_BO_ALLOC_USER) ? ttm_bo_type_device :
 		ttm_bo_type_kernel;
 
+	obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
+
 	/*
 	 * If this function fails, it will call the destructor, but
 	 * our caller still owns the object. So no freeing in the
@@ -790,14 +859,16 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 	 * Similarly, in delayed_destroy, we can't call ttm_bo_put()
 	 * until successful initialization.
 	 */
-	obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
-	ret = ttm_bo_init(&i915->bdev, i915_gem_to_ttm(obj), size,
-			  bo_type, &i915_sys_placement,
-			  mem->min_page_size >> PAGE_SHIFT,
-			  true, NULL, NULL, i915_ttm_bo_destroy);
-	if (!ret)
-		obj->ttm.created = true;
-
-	/* i915 wants -ENXIO when out of memory region space. */
-	return i915_ttm_err_to_gem(ret);
+	ret = ttm_bo_init_reserved(&i915->bdev, i915_gem_to_ttm(obj), size,
+				   bo_type, &i915_sys_placement, 1,
+				   &ctx, NULL, NULL, i915_ttm_bo_destroy);
+	if (ret)
+		return i915_ttm_err_to_gem(ret);
+
+	obj->ttm.created = true;
+	i915_ttm_adjust_domains_after_move(obj);
+	i915_ttm_adjust_gem_after_move(obj);
+	i915_gem_object_unlock(obj);
+
+	return 0;
 }