diff mbox series

[v2,06/21] drm/i915/gt: Batch TLB invalidations

Message ID 9f535a97f32320a213a619a30c961ba44b595453.1657800199.git.mchehab@kernel.org (mailing list archive)
State New, archived
Headers show
Series Fix performance regressions with TLB and add GuC support | expand

Commit Message

Mauro Carvalho Chehab July 14, 2022, 12:06 p.m. UTC
From: Chris Wilson <chris.p.wilson@intel.com>

Invalidate TLB in patch, in order to reduce performance regressions.

Currently, every caller performs a full barrier around a TLB
invalidation, ignoring all other invalidations that may have already
removed their PTEs from the cache. As this is a synchronous operation
and can be quite slow, we cause multiple threads to contend on the TLB
invalidate mutex blocking userspace.

We only need to invalidate the TLB once after replacing our PTE to
ensure that there is no possible continued access to the physical
address before releasing our pages. By tracking a seqno for each full
TLB invalidate we can quickly determine if one has been performed since
rewriting the PTE, and only if necessary trigger one for ourselves.

That helps to reduce the performance regression introduced by TLB
invalidate logic.

[mchehab: rebased to not require moving the code to a separate file]

Cc: stable@vger.kernel.org
Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Cc: Fei Yang <fei.yang@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---

To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/

 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_pages.c     | 21 +++++---
 drivers/gpu/drm/i915/gt/intel_gt.c            | 53 ++++++++++++++-----
 drivers/gpu/drm/i915/gt/intel_gt.h            | 12 ++++-
 drivers/gpu/drm/i915/gt/intel_gt_types.h      | 18 ++++++-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c         |  8 ++-
 drivers/gpu/drm/i915/i915_vma.c               | 34 +++++++++---
 drivers/gpu/drm/i915/i915_vma.h               |  1 +
 drivers/gpu/drm/i915/i915_vma_resource.c      |  5 +-
 drivers/gpu/drm/i915/i915_vma_resource.h      |  6 ++-
 10 files changed, 125 insertions(+), 36 deletions(-)

Comments

Tvrtko Ursulin July 18, 2022, 1:52 p.m. UTC | #1
On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> Invalidate TLB in patch, in order to reduce performance regressions.

"in batches"?

> Currently, every caller performs a full barrier around a TLB
> invalidation, ignoring all other invalidations that may have already
> removed their PTEs from the cache. As this is a synchronous operation
> and can be quite slow, we cause multiple threads to contend on the TLB
> invalidate mutex blocking userspace.
> 
> We only need to invalidate the TLB once after replacing our PTE to
> ensure that there is no possible continued access to the physical
> address before releasing our pages. By tracking a seqno for each full
> TLB invalidate we can quickly determine if one has been performed since
> rewriting the PTE, and only if necessary trigger one for ourselves.
> 
> That helps to reduce the performance regression introduced by TLB
> invalidate logic.
> 
> [mchehab: rebased to not require moving the code to a separate file]
> 
> Cc: stable@vger.kernel.org
> Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Cc: Fei Yang <fei.yang@intel.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
> ---
> 
> To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
> See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/
> 
>   .../gpu/drm/i915/gem/i915_gem_object_types.h  |  3 +-
>   drivers/gpu/drm/i915/gem/i915_gem_pages.c     | 21 +++++---
>   drivers/gpu/drm/i915/gt/intel_gt.c            | 53 ++++++++++++++-----
>   drivers/gpu/drm/i915/gt/intel_gt.h            | 12 ++++-
>   drivers/gpu/drm/i915/gt/intel_gt_types.h      | 18 ++++++-
>   drivers/gpu/drm/i915/gt/intel_ppgtt.c         |  8 ++-
>   drivers/gpu/drm/i915/i915_vma.c               | 34 +++++++++---
>   drivers/gpu/drm/i915/i915_vma.h               |  1 +
>   drivers/gpu/drm/i915/i915_vma_resource.c      |  5 +-
>   drivers/gpu/drm/i915/i915_vma_resource.h      |  6 ++-
>   10 files changed, 125 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> index 5cf36a130061..9f6b14ec189a 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> @@ -335,7 +335,6 @@ struct drm_i915_gem_object {
>   #define I915_BO_READONLY          BIT(7)
>   #define I915_TILING_QUIRK_BIT     8 /* unknown swizzling; do not release! */
>   #define I915_BO_PROTECTED         BIT(9)
> -#define I915_BO_WAS_BOUND_BIT     10
>   	/**
>   	 * @mem_flags - Mutable placement-related flags
>   	 *
> @@ -616,6 +615,8 @@ struct drm_i915_gem_object {
>   		 * pages were last acquired.
>   		 */
>   		bool dirty:1;
> +
> +		u32 tlb;
>   	} mm;
>   
>   	struct {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> index 6835279943df..8357dbdcab5c 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> @@ -191,6 +191,18 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
>   		vunmap(ptr);
>   }
>   
> +static void flush_tlb_invalidate(struct drm_i915_gem_object *obj)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct intel_gt *gt = to_gt(i915);
> +
> +	if (!obj->mm.tlb)
> +		return;
> +
> +	intel_gt_invalidate_tlb(gt, obj->mm.tlb);
> +	obj->mm.tlb = 0;
> +}
> +
>   struct sg_table *
>   __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   {
> @@ -216,14 +228,7 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   	__i915_gem_object_reset_page_iter(obj);
>   	obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
>   
> -	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
> -		struct drm_i915_private *i915 = to_i915(obj->base.dev);
> -		struct intel_gt *gt = to_gt(i915);
> -		intel_wakeref_t wakeref;
> -
> -		with_intel_gt_pm_if_awake(gt, wakeref)
> -			intel_gt_invalidate_tlbs(gt);
> -	}
> +	flush_tlb_invalidate(obj);
>   
>   	return pages;
>   }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 5c55a90672f4..f435e06125aa 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -38,8 +38,6 @@ static void __intel_gt_init_early(struct intel_gt *gt)
>   {
>   	spin_lock_init(&gt->irq_lock);
>   
> -	mutex_init(&gt->tlb_invalidate_lock);
> -
>   	INIT_LIST_HEAD(&gt->closed_vma);
>   	spin_lock_init(&gt->closed_lock);
>   
> @@ -50,6 +48,8 @@ static void __intel_gt_init_early(struct intel_gt *gt)
>   	intel_gt_init_reset(gt);
>   	intel_gt_init_requests(gt);
>   	intel_gt_init_timelines(gt);
> +	mutex_init(&gt->tlb.invalidate_lock);
> +	seqcount_mutex_init(&gt->tlb.seqno, &gt->tlb.invalidate_lock);
>   	intel_gt_pm_init_early(gt);
>   
>   	intel_uc_init_early(&gt->uc);
> @@ -770,6 +770,7 @@ void intel_gt_driver_late_release_all(struct drm_i915_private *i915)
>   		intel_gt_fini_requests(gt);
>   		intel_gt_fini_reset(gt);
>   		intel_gt_fini_timelines(gt);
> +		mutex_destroy(&gt->tlb.invalidate_lock);
>   		intel_engines_free(gt);
>   	}
>   }
> @@ -908,7 +909,7 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>   	return rb;
>   }
>   
> -void intel_gt_invalidate_tlbs(struct intel_gt *gt)
> +static void mmio_invalidate_full(struct intel_gt *gt)
>   {
>   	static const i915_reg_t gen8_regs[] = {
>   		[RENDER_CLASS]			= GEN8_RTCR,
> @@ -931,12 +932,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   	const i915_reg_t *regs;
>   	unsigned int num = 0;
>   
> -	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
> -		return;
> -
> -	if (intel_gt_is_wedged(gt))
> -		return;
> -
>   	if (GRAPHICS_VER(i915) == 12) {
>   		regs = gen12_regs;
>   		num = ARRAY_SIZE(gen12_regs);
> @@ -951,9 +946,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   			  "Platform does not implement TLB invalidation!"))
>   		return;
>   
> -	GEM_TRACE("\n");
> -
> -	mutex_lock(&gt->tlb_invalidate_lock);
>   	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>   
>   	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
> @@ -973,6 +965,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   		awake |= engine->mask;
>   	}
>   
> +	GT_TRACE(gt, "invalidated engines %08x\n", awake);
> +
>   	/* Wa_2207587034:tgl,dg1,rkl,adl-s,adl-p */
>   	if (awake &&
>   	    (IS_TIGERLAKE(i915) ||
> @@ -1012,5 +1006,38 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   	 * transitions.
>   	 */
>   	intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
> -	mutex_unlock(&gt->tlb_invalidate_lock);
> +}
> +
> +static bool tlb_seqno_passed(const struct intel_gt *gt, u32 seqno)
> +{
> +	u32 cur = intel_gt_tlb_seqno(gt);
> +
> +	/* Only skip if a *full* TLB invalidate barrier has passed */
> +	return (s32)(cur - ALIGN(seqno, 2)) > 0;
> +}
> +
> +void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
> +{
> +	intel_wakeref_t wakeref;
> +
> +	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
> +		return;
> +
> +	if (intel_gt_is_wedged(gt))
> +		return;
> +
> +	if (tlb_seqno_passed(gt, seqno))
> +		return;
> +
> +	with_intel_gt_pm_if_awake(gt, wakeref) {
> +		mutex_lock(&gt->tlb.invalidate_lock);
> +		if (tlb_seqno_passed(gt, seqno))
> +			goto unlock;
> +
> +		mmio_invalidate_full(gt);
> +
> +		write_seqcount_invalidate(&gt->tlb.seqno);
> +unlock:
> +		mutex_unlock(&gt->tlb.invalidate_lock);
> +	}
>   }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index 82d6f248d876..40b06adf509a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -101,6 +101,16 @@ void intel_gt_info_print(const struct intel_gt_info *info,
>   
>   void intel_gt_watchdog_work(struct work_struct *work);
>   
> -void intel_gt_invalidate_tlbs(struct intel_gt *gt);
> +static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt)
> +{
> +	return seqprop_sequence(&gt->tlb.seqno);
> +}
> +
> +static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt)
> +{
> +	return intel_gt_tlb_seqno(gt) | 1;
> +}
> +
> +void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno);
>   
>   #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index df708802889d..3804a583382b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -11,6 +11,7 @@
>   #include <linux/llist.h>
>   #include <linux/mutex.h>
>   #include <linux/notifier.h>
> +#include <linux/seqlock.h>
>   #include <linux/spinlock.h>
>   #include <linux/types.h>
>   #include <linux/workqueue.h>
> @@ -83,7 +84,22 @@ struct intel_gt {
>   	struct intel_uc uc;
>   	struct intel_gsc gsc;
>   
> -	struct mutex tlb_invalidate_lock;
> +	struct {
> +		/* Serialize global tlb invalidations */
> +		struct mutex invalidate_lock;
> +
> +		/*
> +		 * Batch TLB invalidations
> +		 *
> +		 * After unbinding the PTE, we need to ensure the TLB
> +		 * are invalidated prior to releasing the physical pages.
> +		 * But we only need one such invalidation for all unbinds,
> +		 * so we track how many TLB invalidations have been
> +		 * performed since unbind the PTE and only emit an extra
> +		 * invalidate if no full barrier has been passed.
> +		 */
> +		seqcount_mutex_t seqno;
> +	} tlb;
>   
>   	struct i915_wa_list wa_list;
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> index d8b94d638559..2da6c82a8bd2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm,
>   void ppgtt_unbind_vma(struct i915_address_space *vm,
>   		      struct i915_vma_resource *vma_res)
>   {
> -	if (vma_res->allocated)
> -		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> +	if (!vma_res->allocated)
> +		return;
> +
> +	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> +	if (vma_res->tlb)
> +		vma_invalidate_tlb(vm, *vma_res->tlb);

The patch is about more than batching? If there is a security hole in 
this area (unbind) with the current code?

Regards,

Tvrtko

>   }
>   
>   static unsigned long pd_count(u64 size, int shift)
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 646f419b2035..84a9ccbc5fc5 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -538,9 +538,6 @@ int i915_vma_bind(struct i915_vma *vma,
>   				   bind_flags);
>   	}
>   
> -	if (bind_flags & I915_VMA_LOCAL_BIND)
> -		set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
> -
>   	atomic_or(bind_flags, &vma->flags);
>   	return 0;
>   }
> @@ -1311,6 +1308,19 @@ I915_SELFTEST_EXPORT int i915_vma_get_pages(struct i915_vma *vma)
>   	return err;
>   }
>   
> +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb)
> +{
> +	/*
> +	 * Before we release the pages that were bound by this vma, we
> +	 * must invalidate all the TLBs that may still have a reference
> +	 * back to our physical address. It only needs to be done once,
> +	 * so after updating the PTE to point away from the pages, record
> +	 * the most recent TLB invalidation seqno, and if we have not yet
> +	 * flushed the TLBs upon release, perform a full invalidation.
> +	 */
> +	WRITE_ONCE(tlb, intel_gt_next_invalidate_tlb_full(vm->gt));
> +}
> +
>   static void __vma_put_pages(struct i915_vma *vma, unsigned int count)
>   {
>   	/* We allocate under vma_get_pages, so beware the shrinker */
> @@ -1942,7 +1952,12 @@ struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
>   		vma->vm->skip_pte_rewrite;
>   	trace_i915_vma_unbind(vma);
>   
> -	unbind_fence = i915_vma_resource_unbind(vma_res);
> +	if (async)
> +		unbind_fence = i915_vma_resource_unbind(vma_res,
> +							&vma->obj->mm.tlb);
> +	else
> +		unbind_fence = i915_vma_resource_unbind(vma_res, NULL);
> +
>   	vma->resource = NULL;
>   
>   	atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE),
> @@ -1950,10 +1965,13 @@ struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
>   
>   	i915_vma_detach(vma);
>   
> -	if (!async && unbind_fence) {
> -		dma_fence_wait(unbind_fence, false);
> -		dma_fence_put(unbind_fence);
> -		unbind_fence = NULL;
> +	if (!async) {
> +		if (unbind_fence) {
> +			dma_fence_wait(unbind_fence, false);
> +			dma_fence_put(unbind_fence);
> +			unbind_fence = NULL;
> +		}
> +		vma_invalidate_tlb(vma->vm, vma->obj->mm.tlb);
>   	}
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 88ca0bd9c900..5048eed536da 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -213,6 +213,7 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
>   			u64 size, u64 alignment, u64 flags);
>   void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
>   void i915_vma_revoke_mmap(struct i915_vma *vma);
> +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb);
>   struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async);
>   int __i915_vma_unbind(struct i915_vma *vma);
>   int __must_check i915_vma_unbind(struct i915_vma *vma);
> diff --git a/drivers/gpu/drm/i915/i915_vma_resource.c b/drivers/gpu/drm/i915/i915_vma_resource.c
> index 27c55027387a..5a67995ea5fe 100644
> --- a/drivers/gpu/drm/i915/i915_vma_resource.c
> +++ b/drivers/gpu/drm/i915/i915_vma_resource.c
> @@ -223,10 +223,13 @@ i915_vma_resource_fence_notify(struct i915_sw_fence *fence,
>    * Return: A refcounted pointer to a dma-fence that signals when unbinding is
>    * complete.
>    */
> -struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res)
> +struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res,
> +					   u32 *tlb)
>   {
>   	struct i915_address_space *vm = vma_res->vm;
>   
> +	vma_res->tlb = tlb;
> +
>   	/* Reference for the sw fence */
>   	i915_vma_resource_get(vma_res);
>   
> diff --git a/drivers/gpu/drm/i915/i915_vma_resource.h b/drivers/gpu/drm/i915/i915_vma_resource.h
> index 5d8427caa2ba..06923d1816e7 100644
> --- a/drivers/gpu/drm/i915/i915_vma_resource.h
> +++ b/drivers/gpu/drm/i915/i915_vma_resource.h
> @@ -67,6 +67,7 @@ struct i915_page_sizes {
>    * taken when the unbind is scheduled.
>    * @skip_pte_rewrite: During ggtt suspend and vm takedown pte rewriting
>    * needs to be skipped for unbind.
> + * @tlb: pointer for obj->mm.tlb, if async unbind. Otherwise, NULL
>    *
>    * The lifetime of a struct i915_vma_resource is from a binding request to
>    * the actual possible asynchronous unbind has completed.
> @@ -119,6 +120,8 @@ struct i915_vma_resource {
>   	bool immediate_unbind:1;
>   	bool needs_wakeref:1;
>   	bool skip_pte_rewrite:1;
> +
> +	u32 *tlb;
>   };
>   
>   bool i915_vma_resource_hold(struct i915_vma_resource *vma_res,
> @@ -131,7 +134,8 @@ struct i915_vma_resource *i915_vma_resource_alloc(void);
>   
>   void i915_vma_resource_free(struct i915_vma_resource *vma_res);
>   
> -struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res);
> +struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res,
> +					   u32 *tlb);
>   
>   void __i915_vma_resource_init(struct i915_vma_resource *vma_res);
>
Mauro Carvalho Chehab July 20, 2022, 7:13 a.m. UTC | #2
On Mon, 18 Jul 2022 14:52:05 +0100
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:

> 
> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
> > From: Chris Wilson <chris.p.wilson@intel.com>
> > 
> > Invalidate TLB in patch, in order to reduce performance regressions.
> 
> "in batches"?

Yeah. Will fix it.

> > diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> > index d8b94d638559..2da6c82a8bd2 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> > @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm,
> >   void ppgtt_unbind_vma(struct i915_address_space *vm,
> >   		      struct i915_vma_resource *vma_res)
> >   {
> > -	if (vma_res->allocated)
> > -		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> > +	if (!vma_res->allocated)
> > +		return;
> > +
> > +	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> > +	if (vma_res->tlb)
> > +		vma_invalidate_tlb(vm, *vma_res->tlb);
> 
> The patch is about more than batching? If there is a security hole in 
> this area (unbind) with the current code?

No, I don't think there's a security hole. The rationale for this is
not due to it.

Since commit 2f6b90da9192 ("drm/i915: Use vma resources for async unbinding"),
VMA unbind can happen either sync or async.

So, the logic needs to do TLB invalidate on two places. After this
patch, the code at __i915_vma_evict is:

	struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
	{
...
		if (async)
			unbind_fence = i915_vma_resource_unbind(vma_res,
								&vma->obj->mm.tlb);
		else
			unbind_fence = i915_vma_resource_unbind(vma_res, NULL);

		vma->resource = NULL;

		atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE),
			   &vma->flags);

		i915_vma_detach(vma);

		if (!async) {
			if (unbind_fence) {
				dma_fence_wait(unbind_fence, false);
				dma_fence_put(unbind_fence);
				unbind_fence = NULL;
			}
			vma_invalidate_tlb(vma->vm, vma->obj->mm.tlb);
		}
...

So, basically, if !async, __i915_vma_evict() will do TLB cache invalidation.

However, when async is used, the actual page release will happen later,
at this function:

	void ppgtt_unbind_vma(struct i915_address_space *vm,
			      struct i915_vma_resource *vma_res)
	{
		if (!vma_res->allocated)
			return;

		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
		if (vma_res->tlb)
			vma_invalidate_tlb(vm, *vma_res->tlb);
	}
	
Regards,
Mauro
Tvrtko Ursulin July 20, 2022, 10:49 a.m. UTC | #3
On 20/07/2022 08:13, Mauro Carvalho Chehab wrote:
> On Mon, 18 Jul 2022 14:52:05 +0100
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> 
>>
>> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
>>> From: Chris Wilson <chris.p.wilson@intel.com>
>>>
>>> Invalidate TLB in patch, in order to reduce performance regressions.
>>
>> "in batches"?
> 
> Yeah. Will fix it.
> 
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
>>> index d8b94d638559..2da6c82a8bd2 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
>>> @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm,
>>>    void ppgtt_unbind_vma(struct i915_address_space *vm,
>>>    		      struct i915_vma_resource *vma_res)
>>>    {
>>> -	if (vma_res->allocated)
>>> -		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
>>> +	if (!vma_res->allocated)
>>> +		return;
>>> +
>>> +	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
>>> +	if (vma_res->tlb)
>>> +		vma_invalidate_tlb(vm, *vma_res->tlb);
>>
>> The patch is about more than batching? If there is a security hole in
>> this area (unbind) with the current code?
> 
> No, I don't think there's a security hole. The rationale for this is
> not due to it.

In this case obvious question is why are these changes in the patch 
which declares itself to be about batching invalidations? Because...

> Since commit 2f6b90da9192 ("drm/i915: Use vma resources for async unbinding"),
> VMA unbind can happen either sync or async.
> 
> So, the logic needs to do TLB invalidate on two places. After this
> patch, the code at __i915_vma_evict is:
> 
> 	struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
> 	{
> ...
> 		if (async)
> 			unbind_fence = i915_vma_resource_unbind(vma_res,
> 								&vma->obj->mm.tlb);
> 		else
> 			unbind_fence = i915_vma_resource_unbind(vma_res, NULL);
> 
> 		vma->resource = NULL;
> 
> 		atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE),
> 			   &vma->flags);
> 
> 		i915_vma_detach(vma);
> 
> 		if (!async) {
> 			if (unbind_fence) {
> 				dma_fence_wait(unbind_fence, false);
> 				dma_fence_put(unbind_fence);
> 				unbind_fence = NULL;
> 			}
> 			vma_invalidate_tlb(vma->vm, vma->obj->mm.tlb);
> 		}
> ...
> 
> So, basically, if !async, __i915_vma_evict() will do TLB cache invalidation.
> 
> However, when async is used, the actual page release will happen later,
> at this function:
> 
> 	void ppgtt_unbind_vma(struct i915_address_space *vm,
> 			      struct i915_vma_resource *vma_res)
> 	{
> 		if (!vma_res->allocated)
> 			return;
> 
> 		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> 		if (vma_res->tlb)
> 			vma_invalidate_tlb(vm, *vma_res->tlb);
> 	}

.. frankly I don't follow since I don't see any page release happening 
in here. Just PTE clearing.

I am explaining why it looks to me that the patch is doing two things. 
Implementing batching _and_ adding invalidation points at VMA unbind 
sites, while so far we had it at backing store release only. Maybe I am 
wrong and perhaps I am too slow to pick up on the explanation here.

So if the patch is doing two things please split it up.

I am further confused by the invalidation call site in evict and in 
unbind - why there can't be one logical site since the logical sequence 
is evict -> unbind.

Regards,

Tvrtko
Tvrtko Ursulin July 20, 2022, 10:54 a.m. UTC | #4
On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> Invalidate TLB in patch, in order to reduce performance regressions.
> 
> Currently, every caller performs a full barrier around a TLB
> invalidation, ignoring all other invalidations that may have already
> removed their PTEs from the cache. As this is a synchronous operation
> and can be quite slow, we cause multiple threads to contend on the TLB
> invalidate mutex blocking userspace.
> 
> We only need to invalidate the TLB once after replacing our PTE to
> ensure that there is no possible continued access to the physical
> address before releasing our pages. By tracking a seqno for each full
> TLB invalidate we can quickly determine if one has been performed since
> rewriting the PTE, and only if necessary trigger one for ourselves.
> 
> That helps to reduce the performance regression introduced by TLB
> invalidate logic.
> 
> [mchehab: rebased to not require moving the code to a separate file]
> 
> Cc: stable@vger.kernel.org
> Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Cc: Fei Yang <fei.yang@intel.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
> ---
> 
> To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
> See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/
> 
>   .../gpu/drm/i915/gem/i915_gem_object_types.h  |  3 +-
>   drivers/gpu/drm/i915/gem/i915_gem_pages.c     | 21 +++++---
>   drivers/gpu/drm/i915/gt/intel_gt.c            | 53 ++++++++++++++-----
>   drivers/gpu/drm/i915/gt/intel_gt.h            | 12 ++++-
>   drivers/gpu/drm/i915/gt/intel_gt_types.h      | 18 ++++++-
>   drivers/gpu/drm/i915/gt/intel_ppgtt.c         |  8 ++-
>   drivers/gpu/drm/i915/i915_vma.c               | 34 +++++++++---
>   drivers/gpu/drm/i915/i915_vma.h               |  1 +
>   drivers/gpu/drm/i915/i915_vma_resource.c      |  5 +-
>   drivers/gpu/drm/i915/i915_vma_resource.h      |  6 ++-
>   10 files changed, 125 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> index 5cf36a130061..9f6b14ec189a 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> @@ -335,7 +335,6 @@ struct drm_i915_gem_object {
>   #define I915_BO_READONLY          BIT(7)
>   #define I915_TILING_QUIRK_BIT     8 /* unknown swizzling; do not release! */
>   #define I915_BO_PROTECTED         BIT(9)
> -#define I915_BO_WAS_BOUND_BIT     10
>   	/**
>   	 * @mem_flags - Mutable placement-related flags
>   	 *
> @@ -616,6 +615,8 @@ struct drm_i915_gem_object {
>   		 * pages were last acquired.
>   		 */
>   		bool dirty:1;
> +
> +		u32 tlb;
>   	} mm;
>   
>   	struct {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> index 6835279943df..8357dbdcab5c 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> @@ -191,6 +191,18 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
>   		vunmap(ptr);
>   }
>   
> +static void flush_tlb_invalidate(struct drm_i915_gem_object *obj)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct intel_gt *gt = to_gt(i915);
> +
> +	if (!obj->mm.tlb)
> +		return;
> +
> +	intel_gt_invalidate_tlb(gt, obj->mm.tlb);
> +	obj->mm.tlb = 0;
> +}
> +
>   struct sg_table *
>   __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   {
> @@ -216,14 +228,7 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   	__i915_gem_object_reset_page_iter(obj);
>   	obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
>   
> -	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
> -		struct drm_i915_private *i915 = to_i915(obj->base.dev);
> -		struct intel_gt *gt = to_gt(i915);
> -		intel_wakeref_t wakeref;
> -
> -		with_intel_gt_pm_if_awake(gt, wakeref)
> -			intel_gt_invalidate_tlbs(gt);
> -	}
> +	flush_tlb_invalidate(obj);
>   
>   	return pages;
>   }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 5c55a90672f4..f435e06125aa 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -38,8 +38,6 @@ static void __intel_gt_init_early(struct intel_gt *gt)
>   {
>   	spin_lock_init(&gt->irq_lock);
>   
> -	mutex_init(&gt->tlb_invalidate_lock);
> -
>   	INIT_LIST_HEAD(&gt->closed_vma);
>   	spin_lock_init(&gt->closed_lock);
>   
> @@ -50,6 +48,8 @@ static void __intel_gt_init_early(struct intel_gt *gt)
>   	intel_gt_init_reset(gt);
>   	intel_gt_init_requests(gt);
>   	intel_gt_init_timelines(gt);
> +	mutex_init(&gt->tlb.invalidate_lock);
> +	seqcount_mutex_init(&gt->tlb.seqno, &gt->tlb.invalidate_lock);
>   	intel_gt_pm_init_early(gt);
>   
>   	intel_uc_init_early(&gt->uc);
> @@ -770,6 +770,7 @@ void intel_gt_driver_late_release_all(struct drm_i915_private *i915)
>   		intel_gt_fini_requests(gt);
>   		intel_gt_fini_reset(gt);
>   		intel_gt_fini_timelines(gt);
> +		mutex_destroy(&gt->tlb.invalidate_lock);
>   		intel_engines_free(gt);
>   	}
>   }
> @@ -908,7 +909,7 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>   	return rb;
>   }
>   
> -void intel_gt_invalidate_tlbs(struct intel_gt *gt)
> +static void mmio_invalidate_full(struct intel_gt *gt)
>   {
>   	static const i915_reg_t gen8_regs[] = {
>   		[RENDER_CLASS]			= GEN8_RTCR,
> @@ -931,12 +932,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   	const i915_reg_t *regs;
>   	unsigned int num = 0;
>   
> -	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
> -		return;
> -
> -	if (intel_gt_is_wedged(gt))
> -		return;
> -
>   	if (GRAPHICS_VER(i915) == 12) {
>   		regs = gen12_regs;
>   		num = ARRAY_SIZE(gen12_regs);
> @@ -951,9 +946,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   			  "Platform does not implement TLB invalidation!"))
>   		return;
>   
> -	GEM_TRACE("\n");
> -
> -	mutex_lock(&gt->tlb_invalidate_lock);
>   	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>   
>   	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
> @@ -973,6 +965,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   		awake |= engine->mask;
>   	}
>   
> +	GT_TRACE(gt, "invalidated engines %08x\n", awake);
> +
>   	/* Wa_2207587034:tgl,dg1,rkl,adl-s,adl-p */
>   	if (awake &&
>   	    (IS_TIGERLAKE(i915) ||
> @@ -1012,5 +1006,38 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   	 * transitions.
>   	 */
>   	intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
> -	mutex_unlock(&gt->tlb_invalidate_lock);
> +}
> +
> +static bool tlb_seqno_passed(const struct intel_gt *gt, u32 seqno)
> +{
> +	u32 cur = intel_gt_tlb_seqno(gt);
> +
> +	/* Only skip if a *full* TLB invalidate barrier has passed */
> +	return (s32)(cur - ALIGN(seqno, 2)) > 0;
> +}
> +
> +void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
> +{
> +	intel_wakeref_t wakeref;
> +
> +	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
> +		return;
> +
> +	if (intel_gt_is_wedged(gt))
> +		return;
> +
> +	if (tlb_seqno_passed(gt, seqno))
> +		return;
> +
> +	with_intel_gt_pm_if_awake(gt, wakeref) {
> +		mutex_lock(&gt->tlb.invalidate_lock);
> +		if (tlb_seqno_passed(gt, seqno))
> +			goto unlock;
> +
> +		mmio_invalidate_full(gt);
> +
> +		write_seqcount_invalidate(&gt->tlb.seqno);
> +unlock:
> +		mutex_unlock(&gt->tlb.invalidate_lock);
> +	}
>   }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index 82d6f248d876..40b06adf509a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -101,6 +101,16 @@ void intel_gt_info_print(const struct intel_gt_info *info,
>   
>   void intel_gt_watchdog_work(struct work_struct *work);
>   
> -void intel_gt_invalidate_tlbs(struct intel_gt *gt);
> +static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt)
> +{
> +	return seqprop_sequence(&gt->tlb.seqno);
> +}
> +
> +static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt)
> +{
> +	return intel_gt_tlb_seqno(gt) | 1;
> +}
> +
> +void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno);
>   
>   #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index df708802889d..3804a583382b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -11,6 +11,7 @@
>   #include <linux/llist.h>
>   #include <linux/mutex.h>
>   #include <linux/notifier.h>
> +#include <linux/seqlock.h>
>   #include <linux/spinlock.h>
>   #include <linux/types.h>
>   #include <linux/workqueue.h>
> @@ -83,7 +84,22 @@ struct intel_gt {
>   	struct intel_uc uc;
>   	struct intel_gsc gsc;
>   
> -	struct mutex tlb_invalidate_lock;
> +	struct {
> +		/* Serialize global tlb invalidations */
> +		struct mutex invalidate_lock;
> +
> +		/*
> +		 * Batch TLB invalidations
> +		 *
> +		 * After unbinding the PTE, we need to ensure the TLB
> +		 * are invalidated prior to releasing the physical pages.
> +		 * But we only need one such invalidation for all unbinds,
> +		 * so we track how many TLB invalidations have been
> +		 * performed since unbind the PTE and only emit an extra
> +		 * invalidate if no full barrier has been passed.
> +		 */
> +		seqcount_mutex_t seqno;
> +	} tlb;
>   
>   	struct i915_wa_list wa_list;
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> index d8b94d638559..2da6c82a8bd2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm,
>   void ppgtt_unbind_vma(struct i915_address_space *vm,
>   		      struct i915_vma_resource *vma_res)
>   {
> -	if (vma_res->allocated)
> -		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> +	if (!vma_res->allocated)
> +		return;
> +
> +	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> +	if (vma_res->tlb)
> +		vma_invalidate_tlb(vm, *vma_res->tlb);
>   }
>   
>   static unsigned long pd_count(u64 size, int shift)
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 646f419b2035..84a9ccbc5fc5 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -538,9 +538,6 @@ int i915_vma_bind(struct i915_vma *vma,
>   				   bind_flags);
>   	}
>   
> -	if (bind_flags & I915_VMA_LOCAL_BIND)
> -		set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
> -
>   	atomic_or(bind_flags, &vma->flags);
>   	return 0;
>   }
> @@ -1311,6 +1308,19 @@ I915_SELFTEST_EXPORT int i915_vma_get_pages(struct i915_vma *vma)
>   	return err;
>   }
>   
> +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb)
> +{
> +	/*
> +	 * Before we release the pages that were bound by this vma, we
> +	 * must invalidate all the TLBs that may still have a reference
> +	 * back to our physical address. It only needs to be done once,
> +	 * so after updating the PTE to point away from the pages, record
> +	 * the most recent TLB invalidation seqno, and if we have not yet
> +	 * flushed the TLBs upon release, perform a full invalidation.
> +	 */
> +	WRITE_ONCE(tlb, intel_gt_next_invalidate_tlb_full(vm->gt));

Shouldn't tlb be a pointer for this to make sense?

Regards,

Tvrtko

> +}
> +
>   static void __vma_put_pages(struct i915_vma *vma, unsigned int count)
>   {
>   	/* We allocate under vma_get_pages, so beware the shrinker */
> @@ -1942,7 +1952,12 @@ struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
>   		vma->vm->skip_pte_rewrite;
>   	trace_i915_vma_unbind(vma);
>   
> -	unbind_fence = i915_vma_resource_unbind(vma_res);
> +	if (async)
> +		unbind_fence = i915_vma_resource_unbind(vma_res,
> +							&vma->obj->mm.tlb);
> +	else
> +		unbind_fence = i915_vma_resource_unbind(vma_res, NULL);
> +
>   	vma->resource = NULL;
>   
>   	atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE),
> @@ -1950,10 +1965,13 @@ struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
>   
>   	i915_vma_detach(vma);
>   
> -	if (!async && unbind_fence) {
> -		dma_fence_wait(unbind_fence, false);
> -		dma_fence_put(unbind_fence);
> -		unbind_fence = NULL;
> +	if (!async) {
> +		if (unbind_fence) {
> +			dma_fence_wait(unbind_fence, false);
> +			dma_fence_put(unbind_fence);
> +			unbind_fence = NULL;
> +		}
> +		vma_invalidate_tlb(vma->vm, vma->obj->mm.tlb);
>   	}
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 88ca0bd9c900..5048eed536da 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -213,6 +213,7 @@ bool i915_vma_misplaced(const struct i915_vma *vma,
>   			u64 size, u64 alignment, u64 flags);
>   void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
>   void i915_vma_revoke_mmap(struct i915_vma *vma);
> +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb);
>   struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async);
>   int __i915_vma_unbind(struct i915_vma *vma);
>   int __must_check i915_vma_unbind(struct i915_vma *vma);
> diff --git a/drivers/gpu/drm/i915/i915_vma_resource.c b/drivers/gpu/drm/i915/i915_vma_resource.c
> index 27c55027387a..5a67995ea5fe 100644
> --- a/drivers/gpu/drm/i915/i915_vma_resource.c
> +++ b/drivers/gpu/drm/i915/i915_vma_resource.c
> @@ -223,10 +223,13 @@ i915_vma_resource_fence_notify(struct i915_sw_fence *fence,
>    * Return: A refcounted pointer to a dma-fence that signals when unbinding is
>    * complete.
>    */
> -struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res)
> +struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res,
> +					   u32 *tlb)
>   {
>   	struct i915_address_space *vm = vma_res->vm;
>   
> +	vma_res->tlb = tlb;
> +
>   	/* Reference for the sw fence */
>   	i915_vma_resource_get(vma_res);
>   
> diff --git a/drivers/gpu/drm/i915/i915_vma_resource.h b/drivers/gpu/drm/i915/i915_vma_resource.h
> index 5d8427caa2ba..06923d1816e7 100644
> --- a/drivers/gpu/drm/i915/i915_vma_resource.h
> +++ b/drivers/gpu/drm/i915/i915_vma_resource.h
> @@ -67,6 +67,7 @@ struct i915_page_sizes {
>    * taken when the unbind is scheduled.
>    * @skip_pte_rewrite: During ggtt suspend and vm takedown pte rewriting
>    * needs to be skipped for unbind.
> + * @tlb: pointer for obj->mm.tlb, if async unbind. Otherwise, NULL
>    *
>    * The lifetime of a struct i915_vma_resource is from a binding request to
>    * the actual possible asynchronous unbind has completed.
> @@ -119,6 +120,8 @@ struct i915_vma_resource {
>   	bool immediate_unbind:1;
>   	bool needs_wakeref:1;
>   	bool skip_pte_rewrite:1;
> +
> +	u32 *tlb;
>   };
>   
>   bool i915_vma_resource_hold(struct i915_vma_resource *vma_res,
> @@ -131,7 +134,8 @@ struct i915_vma_resource *i915_vma_resource_alloc(void);
>   
>   void i915_vma_resource_free(struct i915_vma_resource *vma_res);
>   
> -struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res);
> +struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res,
> +					   u32 *tlb);
>   
>   void __i915_vma_resource_init(struct i915_vma_resource *vma_res);
>
Mauro Carvalho Chehab July 27, 2022, 11:48 a.m. UTC | #5
On Wed, 20 Jul 2022 11:49:59 +0100
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:

> On 20/07/2022 08:13, Mauro Carvalho Chehab wrote:
> > On Mon, 18 Jul 2022 14:52:05 +0100
> > Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> >   
> >>
> >> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:  
> >>> From: Chris Wilson <chris.p.wilson@intel.com>
> >>>
> >>> Invalidate TLB in patch, in order to reduce performance regressions.  
> >>
> >> "in batches"?  
> > 
> > Yeah. Will fix it.

> > +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb)
> > +{
> > +	/*
> > +	 * Before we release the pages that were bound by this vma, we
> > +	 * must invalidate all the TLBs that may still have a reference
> > +	 * back to our physical address. It only needs to be done once,
> > +	 * so after updating the PTE to point away from the pages, record
> > +	 * the most recent TLB invalidation seqno, and if we have not yet
> > +	 * flushed the TLBs upon release, perform a full invalidation.
> > +	 */
> > +	WRITE_ONCE(tlb, intel_gt_next_invalidate_tlb_full(vm->gt));  
> 
> Shouldn't tlb be a pointer for this to make sense?

Oh, my mistake! Will fix at the next version.

> >   
> >>> diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> >>> index d8b94d638559..2da6c82a8bd2 100644
> >>> --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> >>> +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
> >>> @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm,
> >>>    void ppgtt_unbind_vma(struct i915_address_space *vm,
> >>>    		      struct i915_vma_resource *vma_res)
> >>>    {
> >>> -	if (vma_res->allocated)
> >>> -		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> >>> +	if (!vma_res->allocated)
> >>> +		return;
> >>> +
> >>> +	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
> >>> +	if (vma_res->tlb)
> >>> +		vma_invalidate_tlb(vm, *vma_res->tlb);  
> >>
> >> The patch is about more than batching? If there is a security hole in
> >> this area (unbind) with the current code?  
> > 
> > No, I don't think there's a security hole. The rationale for this is
> > not due to it.  
> 
> In this case obvious question is why are these changes in the patch 
> which declares itself to be about batching invalidations? Because...

Because vma_invalidate_tlb() basically stores a TLB seqno, but the
actual invalidation is deferred to when the pages are unset, at
__i915_gem_object_unset_pages().

So, what happens is:

- on VMA sync mode, the need to invalidate TLB is marked at
  __vma_put_pages(), before VMA unbind;
- on async, this is deferred to happen at ppgtt_unbind_vma(), where
  it marks the need to invalidate TLBs.

On both cases, __i915_gem_object_unset_pages() is called later,
when the driver is ready to unmap the page.

> I am explaining why it looks to me that the patch is doing two things. 
> Implementing batching _and_ adding invalidation points at VMA unbind 
> sites, while so far we had it at backing store release only. Maybe I am 
> wrong and perhaps I am too slow to pick up on the explanation here.
> 
> So if the patch is doing two things please split it up.
> 
> I am further confused by the invalidation call site in evict and in 
> unbind - why there can't be one logical site since the logical sequence 
> is evict -> unbind.

The invalidation happens only on one place: __i915_gem_object_unset_pages().

Despite its name, vma_invalidate_tlb() just marks the need of doing TLB
invalidation.

Regards,
Mauro
Tvrtko Ursulin July 27, 2022, 12:56 p.m. UTC | #6
On 27/07/2022 12:48, Mauro Carvalho Chehab wrote:
> On Wed, 20 Jul 2022 11:49:59 +0100
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> 
>> On 20/07/2022 08:13, Mauro Carvalho Chehab wrote:
>>> On Mon, 18 Jul 2022 14:52:05 +0100
>>> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
>>>    
>>>>
>>>> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
>>>>> From: Chris Wilson <chris.p.wilson@intel.com>
>>>>>
>>>>> Invalidate TLB in patch, in order to reduce performance regressions.
>>>>
>>>> "in batches"?
>>>
>>> Yeah. Will fix it.
> 
>>> +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb)
>>> +{
>>> +	/*
>>> +	 * Before we release the pages that were bound by this vma, we
>>> +	 * must invalidate all the TLBs that may still have a reference
>>> +	 * back to our physical address. It only needs to be done once,
>>> +	 * so after updating the PTE to point away from the pages, record
>>> +	 * the most recent TLB invalidation seqno, and if we have not yet
>>> +	 * flushed the TLBs upon release, perform a full invalidation.
>>> +	 */
>>> +	WRITE_ONCE(tlb, intel_gt_next_invalidate_tlb_full(vm->gt));
>>
>> Shouldn't tlb be a pointer for this to make sense?
> 
> Oh, my mistake! Will fix at the next version.
> 
>>>    
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
>>>>> index d8b94d638559..2da6c82a8bd2 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
>>>>> @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm,
>>>>>     void ppgtt_unbind_vma(struct i915_address_space *vm,
>>>>>     		      struct i915_vma_resource *vma_res)
>>>>>     {
>>>>> -	if (vma_res->allocated)
>>>>> -		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
>>>>> +	if (!vma_res->allocated)
>>>>> +		return;
>>>>> +
>>>>> +	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
>>>>> +	if (vma_res->tlb)
>>>>> +		vma_invalidate_tlb(vm, *vma_res->tlb);
>>>>
>>>> The patch is about more than batching? If there is a security hole in
>>>> this area (unbind) with the current code?
>>>
>>> No, I don't think there's a security hole. The rationale for this is
>>> not due to it.
>>
>> In this case obvious question is why are these changes in the patch
>> which declares itself to be about batching invalidations? Because...
> 
> Because vma_invalidate_tlb() basically stores a TLB seqno, but the
> actual invalidation is deferred to when the pages are unset, at
> __i915_gem_object_unset_pages().
> 
> So, what happens is:
> 
> - on VMA sync mode, the need to invalidate TLB is marked at
>    __vma_put_pages(), before VMA unbind;
> - on async, this is deferred to happen at ppgtt_unbind_vma(), where
>    it marks the need to invalidate TLBs.
> 
> On both cases, __i915_gem_object_unset_pages() is called later,
> when the driver is ready to unmap the page.

Sorry still not clear to me why is the patch moving marking of the need 
to invalidate (regardless if it a bit like today, or a seqno like in 
this patch) from bind to unbind?

What if the seqno was stored in i915_vma_bind, where the bit is set 
today, and all the hunks which touch the unbind and evict would 
disappear from the patch. What wouldn't work in that case, if anything?

Regards,

Tvrtko

> 
>> I am explaining why it looks to me that the patch is doing two things.
>> Implementing batching _and_ adding invalidation points at VMA unbind
>> sites, while so far we had it at backing store release only. Maybe I am
>> wrong and perhaps I am too slow to pick up on the explanation here.
>>
>> So if the patch is doing two things please split it up.
>>
>> I am further confused by the invalidation call site in evict and in
>> unbind - why there can't be one logical site since the logical sequence
>> is evict -> unbind.
> 
> The invalidation happens only on one place: __i915_gem_object_unset_pages().
> 
> Despite its name, vma_invalidate_tlb() just marks the need of doing TLB
> invalidation.
> 
> Regards,
> Mauro
Mauro Carvalho Chehab July 28, 2022, 6:32 a.m. UTC | #7
On Wed, 27 Jul 2022 13:56:50 +0100
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:

> > Because vma_invalidate_tlb() basically stores a TLB seqno, but the
> > actual invalidation is deferred to when the pages are unset, at
> > __i915_gem_object_unset_pages().
> > 
> > So, what happens is:
> > 
> > - on VMA sync mode, the need to invalidate TLB is marked at
> >    __vma_put_pages(), before VMA unbind;
> > - on async, this is deferred to happen at ppgtt_unbind_vma(), where
> >    it marks the need to invalidate TLBs.
> > 
> > On both cases, __i915_gem_object_unset_pages() is called later,
> > when the driver is ready to unmap the page.  
> 
> Sorry still not clear to me why is the patch moving marking of the need 
> to invalidate (regardless if it a bit like today, or a seqno like in 
> this patch) from bind to unbind?
> 
> What if the seqno was stored in i915_vma_bind, where the bit is set 
> today, and all the hunks which touch the unbind and evict would 
> disappear from the patch. What wouldn't work in that case, if anything?

Ah, now I see your point.

I can't see any sense on having a sequence number at VMA bind, as the
unbind order can be different. The need of doing a full TLB invalidation
or not depends on the unbind order.

The way the current algorithm works is that drm_i915_gem_object can be
created on any order, and, at unbind/evict, they receive a seqno.

The seqno is incremented at intel_gt_invalidate_tlb():

    void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
    {
	with_intel_gt_pm_if_awake(gt, wakeref) {
		mutex_lock(&gt->tlb.invalidate_lock);
		if (tlb_seqno_passed(gt, seqno))
				goto unlock;

		mmio_invalidate_full(gt);

		write_seqcount_invalidate(&gt->tlb.seqno);	// increment seqno
		

So, let's say 3 objects were created, on this order:

	obj1
	obj2
	obj3

They would be unbind/evict on a different order. On that time, 
the mm.tlb will be stamped with a seqno, using the number from the
last TLB flush, plus 1.

As different threads can be used to handle TLB flushes, let's imagine
two threads (just for the sake of having an example). On such case,
what we would have is:

seqno		Thread 0			Thread 1

seqno=2		unbind/evict event
		obj3.mm.tlb = seqno | 1
seqno=2		unbind/evict event
		obj1.mm.tlb = seqno | 1
						__i915_gem_object_unset_pages() 
						called for obj3, TLB flush happened,
						invalidating both obj1 and obj2.
						seqno += 2					
seqno=4		unbind/evict event
		obj1.mm.tlb = seqno | 1
						__i915_gem_object_unset_pages()
						called for obj1, don't flush.
...
						__i915_gem_object_unset_pages() called for obj2, TLB flush happened
						seqno += 2
seqno=6

So, basically the seqno is used to track when the object data stopped
being updated, because of an unbind/evict event, being later used by
intel_gt_invalidate_tlb() when called from __i915_gem_object_unset_pages(),
in order to check if a previous invalidation call was enough to invalidate
the object, or if a new call is needed.

Now, if seqno is stored at bind, data can still leak, as the assumption
made by intel_gt_invalidate_tlb() that the data stopped being used at
seqno is not true anymore.

Still, I agree that this logic is complex and should be better 
documented. So, if you're now OK with this patch, I'll add the above
explanation inside a kernel-doc comment.

Regards,
Mauro
Mauro Carvalho Chehab July 28, 2022, 7:26 a.m. UTC | #8
On Thu, 28 Jul 2022 08:32:32 +0200
Mauro Carvalho Chehab <mauro.chehab@linux.intel.com> wrote:

> On Wed, 27 Jul 2022 13:56:50 +0100
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> 
> > > Because vma_invalidate_tlb() basically stores a TLB seqno, but the
> > > actual invalidation is deferred to when the pages are unset, at
> > > __i915_gem_object_unset_pages().
> > > 
> > > So, what happens is:
> > > 
> > > - on VMA sync mode, the need to invalidate TLB is marked at
> > >    __vma_put_pages(), before VMA unbind;
> > > - on async, this is deferred to happen at ppgtt_unbind_vma(), where
> > >    it marks the need to invalidate TLBs.
> > > 
> > > On both cases, __i915_gem_object_unset_pages() is called later,
> > > when the driver is ready to unmap the page.    
> > 
> > Sorry still not clear to me why is the patch moving marking of the need 
> > to invalidate (regardless if it a bit like today, or a seqno like in 
> > this patch) from bind to unbind?
> > 
> > What if the seqno was stored in i915_vma_bind, where the bit is set 
> > today, and all the hunks which touch the unbind and evict would 
> > disappear from the patch. What wouldn't work in that case, if anything?  
> 
> Ah, now I see your point.
> 
> I can't see any sense on having a sequence number at VMA bind, as the
> unbind order can be different. The need of doing a full TLB invalidation
> or not depends on the unbind order.
> 
> The way the current algorithm works is that drm_i915_gem_object can be
> created on any order, and, at unbind/evict, they receive a seqno.
> 
> The seqno is incremented at intel_gt_invalidate_tlb():
> 
>     void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
>     {
> 	with_intel_gt_pm_if_awake(gt, wakeref) {
> 		mutex_lock(&gt->tlb.invalidate_lock);
> 		if (tlb_seqno_passed(gt, seqno))
> 				goto unlock;
> 
> 		mmio_invalidate_full(gt);
> 
> 		write_seqcount_invalidate(&gt->tlb.seqno);	// increment seqno
> 		
> 
> So, let's say 3 objects were created, on this order:
> 
> 	obj1
> 	obj2
> 	obj3
> 
> They would be unbind/evict on a different order. On that time, 
> the mm.tlb will be stamped with a seqno, using the number from the
> last TLB flush, plus 1.
> 
> As different threads can be used to handle TLB flushes, let's imagine
> two threads (just for the sake of having an example). On such case,
> what we would have is:
> 
> seqno		Thread 0			Thread 1
> 
> seqno=2		unbind/evict event
> 		obj3.mm.tlb = seqno | 1
> seqno=2		unbind/evict event
> 		obj1.mm.tlb = seqno | 1
> 						__i915_gem_object_unset_pages() 
> 						called for obj3, TLB flush happened,
> 						invalidating both obj1 and obj2.
> 						seqno += 2					
> seqno=4		unbind/evict event
> 		obj1.mm.tlb = seqno | 1

cut-and-paste typo. it should be, instead:

 		obj2.mm.tlb = seqno | 1


> 						__i915_gem_object_unset_pages()
> 						called for obj1, don't flush.
> ...
> 						__i915_gem_object_unset_pages() called for obj2, TLB flush happened
> 						seqno += 2
> seqno=6
> 
> So, basically the seqno is used to track when the object data stopped
> being updated, because of an unbind/evict event, being later used by
> intel_gt_invalidate_tlb() when called from __i915_gem_object_unset_pages(),
> in order to check if a previous invalidation call was enough to invalidate
> the object, or if a new call is needed.
> 
> Now, if seqno is stored at bind, data can still leak, as the assumption
> made by intel_gt_invalidate_tlb() that the data stopped being used at
> seqno is not true anymore.
> 
> Still, I agree that this logic is complex and should be better 
> documented. So, if you're now OK with this patch, I'll add the above
> explanation inside a kernel-doc comment.

I'm enclosing the kernel-doc patch (to be applied after moving the code into
its own files: intel_tlb.c/intel_tlb.h):

[PATCH] drm/i915/gt: document TLB cache invalidation functions

Add a description for the TLB cache invalidation algorithm and for
the related kAPI functions.

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>

diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.c b/drivers/gpu/drm/i915/gt/intel_tlb.c
index af8cae979489..8eda0743da74 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.c
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.c
@@ -145,6 +145,18 @@ static void mmio_invalidate_full(struct intel_gt *gt)
 	intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
 }
 
+/**
+ * intel_gt_invalidate_tlb_full - do full TLB cache invalidation
+ * @gt: GT structure
+ * @seqno: sequence number
+ *
+ * Do a full TLB cache invalidation if the @seqno is bigger than the last
+ * full TLB cache invalidation.
+ *
+ * Note:
+ * The TLB cache invalidation logic depends on GEN-specific registers.
+ * It currently supports GEN8 to GEN12 and GuC-based TLB cache invalidation.
+ */
 void intel_gt_invalidate_tlb_full(struct intel_gt *gt, u32 seqno)
 {
 	intel_wakeref_t wakeref;
@@ -177,6 +189,12 @@ void intel_gt_init_tlb(struct intel_gt *gt)
 	seqcount_mutex_init(&gt->tlb.seqno, &gt->tlb.invalidate_lock);
 }
 
+/**
+ * intel_gt_fini_tlb - initialize TLB-specific vars
+ * @gt: GT structure
+ *
+ * Frees any resources needed by TLB cache invalidation logic.
+ */
 void intel_gt_fini_tlb(struct intel_gt *gt)
 {
 	mutex_destroy(&gt->tlb.invalidate_lock);
diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.h b/drivers/gpu/drm/i915/gt/intel_tlb.h
index 46ce25bf5afe..d186f5d5901f 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.h
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.h
@@ -11,16 +11,99 @@
 
 #include "intel_gt_types.h"
 
+/**
+ * DOC: TLB cache invalidation logic
+ *
+ * The way the current algorithm works is that drm_i915_gem_object can be
+ * created on any order. At unbind/evict time, the object is warranted that
+ * it won't be used anymore. So, they store a sequence number provided by
+ * intel_gt_next_invalidate_tlb_full().This can happen either at
+ * __vma_put_pages(), for VMA sync unbind, or at ppgtt_unbind_vma(), for
+ * VMA async VMA bind.
+ *
+ * At __i915_gem_object_unset_pages(), intel_gt_invalidate_tlb() is called,
+ * where it checks if the sequence number of the object was already invalidated
+ * or not. If not, it increments it::
+ *
+ *   void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
+ *   {
+ *   ...
+ * 	with_intel_gt_pm_if_awake(gt, wakeref) {
+ * 		mutex_lock(&gt->tlb.invalidate_lock);
+ * 		if (tlb_seqno_passed(gt, seqno))
+ * 				goto unlock;
+ *
+ * 		mmio_invalidate_full(gt);
+ *
+ * 		write_seqcount_invalidate(&gt->tlb.seqno); // increment seqno
+ *    ...
+ *
+ * So, let's say the current seqno is 2 and 3 new objects were created,
+ * on this order:
+ *
+ * 	obj1
+ * 	obj2
+ * 	obj3
+ *
+ * They can be unbind/evict on a different order. At unbind/evict time,
+ * the mm.tlb will be stamped with the sequence number, using the number
+ * from the last TLB flush, plus 1.
+ *
+ * Different threads may be used on unbind/evict and/or unset pages.
+ *
+ * As the logic at void intel_gt_invalidate_tlb() is protected by a mutex,
+ * for simplicity, let's consider just two threads::
+ *
+ *   sequence number	Thread 0		Thread 1
+ *
+ *   seqno=2
+ *			unbind/evict event
+ * 			obj3.mm.tlb = seqno | 1
+ *
+ *			unbind/evict event
+ * 			obj1.mm.tlb = seqno | 1
+ * 						__i915_gem_object_unset_pages()
+ * 						called for obj3 => TLB flush
+ * 						invalidating both obj1 and obj2.
+ * 						seqno += 2
+ *   seqno=4
+ *			unbind/evict event
+ * 			obj2.mm.tlb = seqno | 1
+ * 						__i915_gem_object_unset_pages()
+ * 						called for obj1, don't flush,
+ *						as past flush invalidated obj1
+ *
+ * 						__i915_gem_object_unset_pages()
+ *						called for obj2 => TLB flush
+ * 						seqno += 2
+ *   seqno=6
+ */
+
 void intel_gt_invalidate_tlb_full(struct intel_gt *gt, u32 seqno);
 
 void intel_gt_init_tlb(struct intel_gt *gt);
 void intel_gt_fini_tlb(struct intel_gt *gt);
 
+/**
+ * intel_gt_tlb_seqno - Returns the current TLB invlidation sequence number
+ *
+ * @gt: GT structure
+ *
+ * There's no need to lock while calling it, as seqprop_sequence is thread-safe
+ */
 static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt)
 {
 	return seqprop_sequence(&gt->tlb.seqno);
 }
 
+/**
+ * intel_gt_next_invalidate_tlb_full - Returns the next TLB full invalidation
+ *	sequence number
+ *
+ * @gt: GT structure
+ *
+ * There's no need to lock while calling it, as seqprop_sequence is thread-safe
+ */
 static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt)
 {
 	return intel_gt_tlb_seqno(gt) | 1;
Tvrtko Ursulin July 28, 2022, 10:11 a.m. UTC | #9
On 28/07/2022 07:32, Mauro Carvalho Chehab wrote:
> On Wed, 27 Jul 2022 13:56:50 +0100
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> 
>>> Because vma_invalidate_tlb() basically stores a TLB seqno, but the
>>> actual invalidation is deferred to when the pages are unset, at
>>> __i915_gem_object_unset_pages().
>>>
>>> So, what happens is:
>>>
>>> - on VMA sync mode, the need to invalidate TLB is marked at
>>>     __vma_put_pages(), before VMA unbind;
>>> - on async, this is deferred to happen at ppgtt_unbind_vma(), where
>>>     it marks the need to invalidate TLBs.
>>>
>>> On both cases, __i915_gem_object_unset_pages() is called later,
>>> when the driver is ready to unmap the page.
>>
>> Sorry still not clear to me why is the patch moving marking of the need
>> to invalidate (regardless if it a bit like today, or a seqno like in
>> this patch) from bind to unbind?
>>
>> What if the seqno was stored in i915_vma_bind, where the bit is set
>> today, and all the hunks which touch the unbind and evict would
>> disappear from the patch. What wouldn't work in that case, if anything?
> 
> Ah, now I see your point.
> 
> I can't see any sense on having a sequence number at VMA bind, as the
> unbind order can be different. The need of doing a full TLB invalidation
> or not depends on the unbind order.

Sorry yes that was stupid from me.. What I was really thinking was the 
approach I initially used for coalescing. Keeping the set_bit in bind 
and then once the code enters intel_gt_invalidate_tlbs, takes a "ticket" 
and waits on the mutex. Once it gets the mutex checks the ticket against 
the GT copy and if two invalidations have passed since it was waiting on 
the mutex it can immediately exit. That would seem like a minimal 
improvement to batch things up.

But I guess it would still emit needless invalidations if there is no 
contention, just a stream of serialized put pages. While the approach 
from this patch can skip all but truly required.

Okay, go for it and thanks for the explanations.

Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

P.S. The last remaining "ugliness" is the 2nd call to invalidation from 
evict. It would be nicer if there was a single common place to do it on 
vma unbind but okay, I do not plan to dig into it so fine.

> 
> The way the current algorithm works is that drm_i915_gem_object can be
> created on any order, and, at unbind/evict, they receive a seqno.
> 
> The seqno is incremented at intel_gt_invalidate_tlb():
> 
>      void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
>      {
> 	with_intel_gt_pm_if_awake(gt, wakeref) {
> 		mutex_lock(&gt->tlb.invalidate_lock);
> 		if (tlb_seqno_passed(gt, seqno))
> 				goto unlock;
> 
> 		mmio_invalidate_full(gt);
> 
> 		write_seqcount_invalidate(&gt->tlb.seqno);	// increment seqno
> 		
> 
> So, let's say 3 objects were created, on this order:
> 
> 	obj1
> 	obj2
> 	obj3
> 
> They would be unbind/evict on a different order. On that time,
> the mm.tlb will be stamped with a seqno, using the number from the
> last TLB flush, plus 1.
> 
> As different threads can be used to handle TLB flushes, let's imagine
> two threads (just for the sake of having an example). On such case,
> what we would have is:
> 
> seqno		Thread 0			Thread 1
> 
> seqno=2		unbind/evict event
> 		obj3.mm.tlb = seqno | 1
> seqno=2		unbind/evict event
> 		obj1.mm.tlb = seqno | 1
> 						__i915_gem_object_unset_pages()
> 						called for obj3, TLB flush happened,
> 						invalidating both obj1 and obj2.
> 						seqno += 2					
> seqno=4		unbind/evict event
> 		obj1.mm.tlb = seqno | 1
> 						__i915_gem_object_unset_pages()
> 						called for obj1, don't flush.
> ...
> 						__i915_gem_object_unset_pages() called for obj2, TLB flush happened
> 						seqno += 2
> seqno=6
> 
> So, basically the seqno is used to track when the object data stopped
> being updated, because of an unbind/evict event, being later used by
> intel_gt_invalidate_tlb() when called from __i915_gem_object_unset_pages(),
> in order to check if a previous invalidation call was enough to invalidate
> the object, or if a new call is needed.
> 
> Now, if seqno is stored at bind, data can still leak, as the assumption
> made by intel_gt_invalidate_tlb() that the data stopped being used at
> seqno is not true anymore.
> 
> Still, I agree that this logic is complex and should be better
> documented. So, if you're now OK with this patch, I'll add the above
> explanation inside a kernel-doc comment.
> 
> Regards,
> Mauro
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 5cf36a130061..9f6b14ec189a 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -335,7 +335,6 @@  struct drm_i915_gem_object {
 #define I915_BO_READONLY          BIT(7)
 #define I915_TILING_QUIRK_BIT     8 /* unknown swizzling; do not release! */
 #define I915_BO_PROTECTED         BIT(9)
-#define I915_BO_WAS_BOUND_BIT     10
 	/**
 	 * @mem_flags - Mutable placement-related flags
 	 *
@@ -616,6 +615,8 @@  struct drm_i915_gem_object {
 		 * pages were last acquired.
 		 */
 		bool dirty:1;
+
+		u32 tlb;
 	} mm;
 
 	struct {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 6835279943df..8357dbdcab5c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -191,6 +191,18 @@  static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
 		vunmap(ptr);
 }
 
+static void flush_tlb_invalidate(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	struct intel_gt *gt = to_gt(i915);
+
+	if (!obj->mm.tlb)
+		return;
+
+	intel_gt_invalidate_tlb(gt, obj->mm.tlb);
+	obj->mm.tlb = 0;
+}
+
 struct sg_table *
 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
 {
@@ -216,14 +228,7 @@  __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
 	__i915_gem_object_reset_page_iter(obj);
 	obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
 
-	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
-		struct drm_i915_private *i915 = to_i915(obj->base.dev);
-		struct intel_gt *gt = to_gt(i915);
-		intel_wakeref_t wakeref;
-
-		with_intel_gt_pm_if_awake(gt, wakeref)
-			intel_gt_invalidate_tlbs(gt);
-	}
+	flush_tlb_invalidate(obj);
 
 	return pages;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index 5c55a90672f4..f435e06125aa 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -38,8 +38,6 @@  static void __intel_gt_init_early(struct intel_gt *gt)
 {
 	spin_lock_init(&gt->irq_lock);
 
-	mutex_init(&gt->tlb_invalidate_lock);
-
 	INIT_LIST_HEAD(&gt->closed_vma);
 	spin_lock_init(&gt->closed_lock);
 
@@ -50,6 +48,8 @@  static void __intel_gt_init_early(struct intel_gt *gt)
 	intel_gt_init_reset(gt);
 	intel_gt_init_requests(gt);
 	intel_gt_init_timelines(gt);
+	mutex_init(&gt->tlb.invalidate_lock);
+	seqcount_mutex_init(&gt->tlb.seqno, &gt->tlb.invalidate_lock);
 	intel_gt_pm_init_early(gt);
 
 	intel_uc_init_early(&gt->uc);
@@ -770,6 +770,7 @@  void intel_gt_driver_late_release_all(struct drm_i915_private *i915)
 		intel_gt_fini_requests(gt);
 		intel_gt_fini_reset(gt);
 		intel_gt_fini_timelines(gt);
+		mutex_destroy(&gt->tlb.invalidate_lock);
 		intel_engines_free(gt);
 	}
 }
@@ -908,7 +909,7 @@  get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
 	return rb;
 }
 
-void intel_gt_invalidate_tlbs(struct intel_gt *gt)
+static void mmio_invalidate_full(struct intel_gt *gt)
 {
 	static const i915_reg_t gen8_regs[] = {
 		[RENDER_CLASS]			= GEN8_RTCR,
@@ -931,12 +932,6 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 	const i915_reg_t *regs;
 	unsigned int num = 0;
 
-	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
-		return;
-
-	if (intel_gt_is_wedged(gt))
-		return;
-
 	if (GRAPHICS_VER(i915) == 12) {
 		regs = gen12_regs;
 		num = ARRAY_SIZE(gen12_regs);
@@ -951,9 +946,6 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 			  "Platform does not implement TLB invalidation!"))
 		return;
 
-	GEM_TRACE("\n");
-
-	mutex_lock(&gt->tlb_invalidate_lock);
 	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
 
 	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
@@ -973,6 +965,8 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 		awake |= engine->mask;
 	}
 
+	GT_TRACE(gt, "invalidated engines %08x\n", awake);
+
 	/* Wa_2207587034:tgl,dg1,rkl,adl-s,adl-p */
 	if (awake &&
 	    (IS_TIGERLAKE(i915) ||
@@ -1012,5 +1006,38 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 	 * transitions.
 	 */
 	intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
-	mutex_unlock(&gt->tlb_invalidate_lock);
+}
+
+static bool tlb_seqno_passed(const struct intel_gt *gt, u32 seqno)
+{
+	u32 cur = intel_gt_tlb_seqno(gt);
+
+	/* Only skip if a *full* TLB invalidate barrier has passed */
+	return (s32)(cur - ALIGN(seqno, 2)) > 0;
+}
+
+void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
+{
+	intel_wakeref_t wakeref;
+
+	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
+		return;
+
+	if (intel_gt_is_wedged(gt))
+		return;
+
+	if (tlb_seqno_passed(gt, seqno))
+		return;
+
+	with_intel_gt_pm_if_awake(gt, wakeref) {
+		mutex_lock(&gt->tlb.invalidate_lock);
+		if (tlb_seqno_passed(gt, seqno))
+			goto unlock;
+
+		mmio_invalidate_full(gt);
+
+		write_seqcount_invalidate(&gt->tlb.seqno);
+unlock:
+		mutex_unlock(&gt->tlb.invalidate_lock);
+	}
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index 82d6f248d876..40b06adf509a 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -101,6 +101,16 @@  void intel_gt_info_print(const struct intel_gt_info *info,
 
 void intel_gt_watchdog_work(struct work_struct *work);
 
-void intel_gt_invalidate_tlbs(struct intel_gt *gt);
+static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt)
+{
+	return seqprop_sequence(&gt->tlb.seqno);
+}
+
+static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt)
+{
+	return intel_gt_tlb_seqno(gt) | 1;
+}
+
+void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno);
 
 #endif /* __INTEL_GT_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index df708802889d..3804a583382b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -11,6 +11,7 @@ 
 #include <linux/llist.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
+#include <linux/seqlock.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
@@ -83,7 +84,22 @@  struct intel_gt {
 	struct intel_uc uc;
 	struct intel_gsc gsc;
 
-	struct mutex tlb_invalidate_lock;
+	struct {
+		/* Serialize global tlb invalidations */
+		struct mutex invalidate_lock;
+
+		/*
+		 * Batch TLB invalidations
+		 *
+		 * After unbinding the PTE, we need to ensure the TLB
+		 * are invalidated prior to releasing the physical pages.
+		 * But we only need one such invalidation for all unbinds,
+		 * so we track how many TLB invalidations have been
+		 * performed since unbind the PTE and only emit an extra
+		 * invalidate if no full barrier has been passed.
+		 */
+		seqcount_mutex_t seqno;
+	} tlb;
 
 	struct i915_wa_list wa_list;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
index d8b94d638559..2da6c82a8bd2 100644
--- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
@@ -206,8 +206,12 @@  void ppgtt_bind_vma(struct i915_address_space *vm,
 void ppgtt_unbind_vma(struct i915_address_space *vm,
 		      struct i915_vma_resource *vma_res)
 {
-	if (vma_res->allocated)
-		vm->clear_range(vm, vma_res->start, vma_res->vma_size);
+	if (!vma_res->allocated)
+		return;
+
+	vm->clear_range(vm, vma_res->start, vma_res->vma_size);
+	if (vma_res->tlb)
+		vma_invalidate_tlb(vm, *vma_res->tlb);
 }
 
 static unsigned long pd_count(u64 size, int shift)
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 646f419b2035..84a9ccbc5fc5 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -538,9 +538,6 @@  int i915_vma_bind(struct i915_vma *vma,
 				   bind_flags);
 	}
 
-	if (bind_flags & I915_VMA_LOCAL_BIND)
-		set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
-
 	atomic_or(bind_flags, &vma->flags);
 	return 0;
 }
@@ -1311,6 +1308,19 @@  I915_SELFTEST_EXPORT int i915_vma_get_pages(struct i915_vma *vma)
 	return err;
 }
 
+void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb)
+{
+	/*
+	 * Before we release the pages that were bound by this vma, we
+	 * must invalidate all the TLBs that may still have a reference
+	 * back to our physical address. It only needs to be done once,
+	 * so after updating the PTE to point away from the pages, record
+	 * the most recent TLB invalidation seqno, and if we have not yet
+	 * flushed the TLBs upon release, perform a full invalidation.
+	 */
+	WRITE_ONCE(tlb, intel_gt_next_invalidate_tlb_full(vm->gt));
+}
+
 static void __vma_put_pages(struct i915_vma *vma, unsigned int count)
 {
 	/* We allocate under vma_get_pages, so beware the shrinker */
@@ -1942,7 +1952,12 @@  struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
 		vma->vm->skip_pte_rewrite;
 	trace_i915_vma_unbind(vma);
 
-	unbind_fence = i915_vma_resource_unbind(vma_res);
+	if (async)
+		unbind_fence = i915_vma_resource_unbind(vma_res,
+							&vma->obj->mm.tlb);
+	else
+		unbind_fence = i915_vma_resource_unbind(vma_res, NULL);
+
 	vma->resource = NULL;
 
 	atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE),
@@ -1950,10 +1965,13 @@  struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async)
 
 	i915_vma_detach(vma);
 
-	if (!async && unbind_fence) {
-		dma_fence_wait(unbind_fence, false);
-		dma_fence_put(unbind_fence);
-		unbind_fence = NULL;
+	if (!async) {
+		if (unbind_fence) {
+			dma_fence_wait(unbind_fence, false);
+			dma_fence_put(unbind_fence);
+			unbind_fence = NULL;
+		}
+		vma_invalidate_tlb(vma->vm, vma->obj->mm.tlb);
 	}
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 88ca0bd9c900..5048eed536da 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -213,6 +213,7 @@  bool i915_vma_misplaced(const struct i915_vma *vma,
 			u64 size, u64 alignment, u64 flags);
 void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
 void i915_vma_revoke_mmap(struct i915_vma *vma);
+void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb);
 struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async);
 int __i915_vma_unbind(struct i915_vma *vma);
 int __must_check i915_vma_unbind(struct i915_vma *vma);
diff --git a/drivers/gpu/drm/i915/i915_vma_resource.c b/drivers/gpu/drm/i915/i915_vma_resource.c
index 27c55027387a..5a67995ea5fe 100644
--- a/drivers/gpu/drm/i915/i915_vma_resource.c
+++ b/drivers/gpu/drm/i915/i915_vma_resource.c
@@ -223,10 +223,13 @@  i915_vma_resource_fence_notify(struct i915_sw_fence *fence,
  * Return: A refcounted pointer to a dma-fence that signals when unbinding is
  * complete.
  */
-struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res)
+struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res,
+					   u32 *tlb)
 {
 	struct i915_address_space *vm = vma_res->vm;
 
+	vma_res->tlb = tlb;
+
 	/* Reference for the sw fence */
 	i915_vma_resource_get(vma_res);
 
diff --git a/drivers/gpu/drm/i915/i915_vma_resource.h b/drivers/gpu/drm/i915/i915_vma_resource.h
index 5d8427caa2ba..06923d1816e7 100644
--- a/drivers/gpu/drm/i915/i915_vma_resource.h
+++ b/drivers/gpu/drm/i915/i915_vma_resource.h
@@ -67,6 +67,7 @@  struct i915_page_sizes {
  * taken when the unbind is scheduled.
  * @skip_pte_rewrite: During ggtt suspend and vm takedown pte rewriting
  * needs to be skipped for unbind.
+ * @tlb: pointer for obj->mm.tlb, if async unbind. Otherwise, NULL
  *
  * The lifetime of a struct i915_vma_resource is from a binding request to
  * the actual possible asynchronous unbind has completed.
@@ -119,6 +120,8 @@  struct i915_vma_resource {
 	bool immediate_unbind:1;
 	bool needs_wakeref:1;
 	bool skip_pte_rewrite:1;
+
+	u32 *tlb;
 };
 
 bool i915_vma_resource_hold(struct i915_vma_resource *vma_res,
@@ -131,7 +134,8 @@  struct i915_vma_resource *i915_vma_resource_alloc(void);
 
 void i915_vma_resource_free(struct i915_vma_resource *vma_res);
 
-struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res);
+struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res,
+					   u32 *tlb);
 
 void __i915_vma_resource_init(struct i915_vma_resource *vma_res);