[1/7] drm/i915/gtt: Defer address space cleanup to an RCU worker

Message ID 20190619112341.9082-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson June 19, 2019, 11:23 a.m. UTC
Enable RCU protection of i915_address_space and its ppgtt superclasses,
and defer its cleanup into a worker executed after an RCU grace period.

In the future we will be able to use the RCU protection to reduce the
locking around VM lookups, but the immediate benefit is being able to
defer the release into a kworker (process context). This is required as
we may need to sleep to reap the WC pages stashed away inside the ppgtt.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110934
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile.header-test     |   1 +
 drivers/gpu/drm/i915/i915_gem_gtt.c           | 109 ++++++++++--------
 drivers/gpu/drm/i915/i915_gem_gtt.h           |   5 +
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   2 -
 4 files changed, 66 insertions(+), 51 deletions(-)
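
For reference, the API distinction the patch relies on: call_rcu() runs
its callback in softirq context, where sleeping is forbidden, whereas
queue_rcu_work() waits out the same RCU grace period and then runs an
ordinary work item in process context, where it may sleep. A minimal
sketch of the pattern with generic names (struct obj and its helpers
are illustrative, not the i915 code itself):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
	struct rcu_head rcu_head;	/* for call_rcu() */
	struct rcu_work rcu_work;	/* for queue_rcu_work() */
};

static __maybe_unused void obj_free_atomic(struct rcu_head *head)
{
	/* call_rcu() callback: softirq context, must not sleep. */
	kfree(container_of(head, struct obj, rcu_head));
}

static void obj_free_sleepable(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, rcu_work.work);

	/* Worker callback: process context, may sleep before freeing. */
	kfree(o);
}

static void obj_release(struct obj *o)
{
	INIT_RCU_WORK(&o->rcu_work, obj_free_sleepable);

	/* Both variants wait a grace period; only the worker may sleep. */
	queue_rcu_work(system_unbound_wq, &o->rcu_work);
	/* Softirq alternative: call_rcu(&o->rcu_head, obj_free_atomic); */
}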

Comments

Tvrtko Ursulin June 19, 2019, 2:51 p.m. UTC | #1
On 19/06/2019 12:23, Chris Wilson wrote:
> Enable RCU protection of i915_address_space and its ppgtt superclasses,
> and defer its cleanup into a worker executed after an RCU grace period.
> 
> In the future we will be able to use the RCU protection to reduce the
> locking around VM lookups, but the immediate benefit is being able to
> defer the release into a kworker (process context). This is required as
> we may need to sleep to reap the WC pages stashed away inside the ppgtt.

I can't see that it adds RCU protection apart from using queue_rcu_work 
for some reason. It _seems_ like it could just as well use a normal 
deferred worker? RCU may have the benefit of being relaxed in timing, 
i.e. we don't care that it happens immediately, but also don't want to 
put some made-up time period against it. So it's all about natural 
batching only at this point?

> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110934
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile.header-test     |   1 +
>   drivers/gpu/drm/i915/i915_gem_gtt.c           | 109 ++++++++++--------
>   drivers/gpu/drm/i915/i915_gem_gtt.h           |   5 +
>   drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   2 -
>   4 files changed, 66 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/Makefile.header-test b/drivers/gpu/drm/i915/Makefile.header-test
> index e6ba66f787f9..cb74242f9c3b 100644
> --- a/drivers/gpu/drm/i915/Makefile.header-test
> +++ b/drivers/gpu/drm/i915/Makefile.header-test
> @@ -6,6 +6,7 @@ header_test := \
>   	i915_active_types.h \
>   	i915_debugfs.h \
>   	i915_drv.h \
> +	i915_gem_gtt.h \
>   	i915_irq.h \
>   	i915_params.h \
>   	i915_priolist_types.h \
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 8ab820145ea6..d9eddbd79670 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -482,9 +482,69 @@ static void vm_free_page(struct i915_address_space *vm, struct page *page)
>   	spin_unlock(&vm->free_pages.lock);
>   }
>   
> +static void i915_address_space_fini(struct i915_address_space *vm)
> +{
> +	spin_lock(&vm->free_pages.lock);
> +	if (pagevec_count(&vm->free_pages.pvec))
> +		vm_free_pages_release(vm, true);
> +	GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec));
> +	spin_unlock(&vm->free_pages.lock);
> +
> +	drm_mm_takedown(&vm->mm);
> +
> +	mutex_destroy(&vm->mutex);
> +}
> +
> +static void ppgtt_destroy_vma(struct i915_address_space *vm)
> +{
> +	struct list_head *phases[] = {
> +		&vm->bound_list,
> +		&vm->unbound_list,
> +		NULL,
> +	}, **phase;
> +
> +	mutex_lock(&vm->i915->drm.struct_mutex);
> +	for (phase = phases; *phase; phase++) {
> +		struct i915_vma *vma, *vn;
> +
> +		list_for_each_entry_safe(vma, vn, *phase, vm_link)

No one can access the lists at this point, right? So apart from the 
inefficiency, the strictly pedantic thing would be to take and release 
struct_mutex around i915_vma_destroy only, I think. It's fine to 
optimize that, but maybe add a comment above the mutex_lock.

> +			i915_vma_destroy(vma);
> +	}
> +	mutex_unlock(&vm->i915->drm.struct_mutex);
> +}
> +
> +static void __i915_vm_release(struct work_struct *work)
> +{
> +	struct i915_address_space *vm =
> +		container_of(work, struct i915_address_space, rcu.work);
> +
> +	ppgtt_destroy_vma(vm);
> +
> +	GEM_BUG_ON(!list_empty(&vm->bound_list));
> +	GEM_BUG_ON(!list_empty(&vm->unbound_list));
> +
> +	vm->cleanup(vm);
> +	i915_address_space_fini(vm);
> +
> +	kfree(vm);
> +}
> +
> +void i915_vm_release(struct kref *kref)
> +{
> +	struct i915_address_space *vm =
> +		container_of(kref, struct i915_address_space, ref);
> +
> +	GEM_BUG_ON(i915_is_ggtt(vm));
> +	trace_i915_ppgtt_release(vm);
> +
> +	vm->closed = true;

This is now outside any locking, although I am not sure if it matters. 
Feels weird at least.

> +	queue_rcu_work(system_unbound_wq, &vm->rcu);
> +}
> +
>   static void i915_address_space_init(struct i915_address_space *vm, int subclass)
>   {
>   	kref_init(&vm->ref);
> +	INIT_RCU_WORK(&vm->rcu, __i915_vm_release);
>   
>   	/*
>   	 * The vm->mutex must be reclaim safe (for use in the shrinker).
> @@ -505,19 +565,6 @@ static void i915_address_space_init(struct i915_address_space *vm, int subclass)
>   	INIT_LIST_HEAD(&vm->bound_list);
>   }
>   
> -static void i915_address_space_fini(struct i915_address_space *vm)
> -{
> -	spin_lock(&vm->free_pages.lock);
> -	if (pagevec_count(&vm->free_pages.pvec))
> -		vm_free_pages_release(vm, true);
> -	GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec));
> -	spin_unlock(&vm->free_pages.lock);
> -
> -	drm_mm_takedown(&vm->mm);
> -
> -	mutex_destroy(&vm->mutex);
> -}
> -
>   static int __setup_page_dma(struct i915_address_space *vm,
>   			    struct i915_page_dma *p,
>   			    gfp_t gfp)
> @@ -2250,42 +2297,6 @@ i915_ppgtt_create(struct drm_i915_private *i915)
>   	return ppgtt;
>   }
>   
> -static void ppgtt_destroy_vma(struct i915_address_space *vm)
> -{
> -	struct list_head *phases[] = {
> -		&vm->bound_list,
> -		&vm->unbound_list,
> -		NULL,
> -	}, **phase;
> -
> -	vm->closed = true;
> -	for (phase = phases; *phase; phase++) {
> -		struct i915_vma *vma, *vn;
> -
> -		list_for_each_entry_safe(vma, vn, *phase, vm_link)
> -			i915_vma_destroy(vma);
> -	}
> -}
> -
> -void i915_vm_release(struct kref *kref)
> -{
> -	struct i915_address_space *vm =
> -		container_of(kref, struct i915_address_space, ref);
> -
> -	GEM_BUG_ON(i915_is_ggtt(vm));
> -	trace_i915_ppgtt_release(vm);
> -
> -	ppgtt_destroy_vma(vm);
> -
> -	GEM_BUG_ON(!list_empty(&vm->bound_list));
> -	GEM_BUG_ON(!list_empty(&vm->unbound_list));
> -
> -	vm->cleanup(vm);
> -	i915_address_space_fini(vm);
> -
> -	kfree(vm);
> -}
> -
>   /* Certain Gen5 chipsets require require idling the GPU before
>    * unmapping anything from the GTT when VT-d is enabled.
>    */
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 812717ccc69b..8de57f67a911 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -35,8 +35,12 @@
>   #define __I915_GEM_GTT_H__
>   
>   #include <linux/io-mapping.h>
> +#include <linux/kref.h>
>   #include <linux/mm.h>
>   #include <linux/pagevec.h>
> +#include <linux/workqueue.h>
> +
> +#include <drm/drm_mm.h>
>   
>   #include "gt/intel_reset.h"
>   #include "i915_gem_fence_reg.h"
> @@ -280,6 +284,7 @@ struct pagestash {
>   
>   struct i915_address_space {
>   	struct kref ref;
> +	struct rcu_work rcu;
>   
>   	struct drm_mm mm;
>   	struct drm_i915_private *i915;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 1a60b9fe8221..0c47276ed5df 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -208,9 +208,7 @@ static int igt_ppgtt_alloc(void *arg)
>   	}
>   
>   err_ppgtt_cleanup:
> -	mutex_lock(&dev_priv->drm.struct_mutex);
>   	i915_vm_put(&ppgtt->vm);
> -	mutex_unlock(&dev_priv->drm.struct_mutex);
>   	return err;
>   }
>   
> 

Regards,

Tvrtko
Chris Wilson June 19, 2019, 2:56 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-06-19 15:51:18)
> 
> On 19/06/2019 12:23, Chris Wilson wrote:
> > Enable RCU protection of i915_address_space and its ppgtt superclasses,
> > and defer its cleanup into a worker executed after an RCU grace period.
> > 
> > In the future we will be able to use the RCU protection to reduce the
> > locking around VM lookups, but the immediate benefit is being able to
> > defer the release into a kworker (process context). This is required as
> > we may need to sleep to reap the WC pages stashed away inside the ppgtt.
> 
> I can't see that it adds RCU protection apart from using queue_rcu_work 
> for some reason.

That provides the RCU-safe freeing, yes. My intention is to use the
rcu read lock around the vm lookup + kref when dropping the struct_mutex
there.

> It _seems_ like it could just as well use a normal
> deferred worker? RCU may have the benefit of being relaxed in timing,
> i.e. we don't care that it happens immediately, but also don't want to
> put some made-up time period against it. So it's all about natural
> batching only at this point?

At this moment, to fix up the current bug and to allow i915_active to be
struct_mutex-less, we need to defer the i915_vm_release and
gen8_ppgtt_release to process context. Hence using the worker callback
and not the regular softirq-context rcu callback.
-Chris
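
A rough sketch of the future lookup Chris describes, for context. The
helper name and the idr it consults are hypothetical, not the actual
i915 code; the point is that the grace period inserted before
__i915_vm_release() runs keeps the vm's memory valid across an unlocked
lookup, provided the vm is unpublished from the table before its last
reference drops:

#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/rcupdate.h>
#include "i915_gem_gtt.h"	/* struct i915_address_space */

/* Hypothetical: look up a vm by id without holding struct_mutex. */
static struct i915_address_space *
i915_lookup_vm_rcu(struct idr *vm_idr, unsigned long id)
{
	struct i915_address_space *vm;

	rcu_read_lock();
	vm = idr_find(vm_idr, id);
	/*
	 * The grace period keeps vm valid memory here even if the last
	 * reference was just dropped; kref_get_unless_zero() then fails
	 * cleanly instead of resurrecting a dead vm.
	 */
	if (vm && !kref_get_unless_zero(&vm->ref))
		vm = NULL;
	rcu_read_unlock();

	return vm;
}
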
Tvrtko Ursulin June 20, 2019, 4:51 p.m. UTC | #3
On 19/06/2019 15:56, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-06-19 15:51:18)
>>
>> On 19/06/2019 12:23, Chris Wilson wrote:
>>> Enable RCU protection of i915_address_space and its ppgtt superclasses,
>>> and defer its cleanup into a worker executed after an RCU grace period.
>>>
>>> In the future we will be able to use the RCU protection to reduce the
>>> locking around VM lookups, but the immediate benefit is being able to
>>> defer the release into a kworker (process context). This is required as
>>> we may need to sleep to reap the WC pages stashed away inside the ppgtt.
>>
>> I can't see that it adds RCU protection apart from using queue_rcu_work
>> for some reason.
> 
> That provides the RCU-safe freeing, yes. My intention is to use the
> rcu read lock around the vm lookup + kref when dropping the struct_mutex
> there.
> 
>> It _seems_ like it could just as well use a normal
>> deferred worker? RCU may have the benefit of being relaxed in timing,
>> i.e. we don't care that it happens immediately, but also don't want to
>> put some made-up time period against it. So it's all about natural
>> batching only at this point?
> 
> At this moment, to fix up the current bug and to allow i915_active to be
> struct_mutex-less, we need to defer the i915_vm_release and
> gen8_ppgtt_release to process context. Hence using the worker callback
> and not the regular softirq-context rcu callback.

It's fine by me, I just wanted some reassurance about future plans. :)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

Patch

diff --git a/drivers/gpu/drm/i915/Makefile.header-test b/drivers/gpu/drm/i915/Makefile.header-test
index e6ba66f787f9..cb74242f9c3b 100644
--- a/drivers/gpu/drm/i915/Makefile.header-test
+++ b/drivers/gpu/drm/i915/Makefile.header-test
@@ -6,6 +6,7 @@  header_test := \
 	i915_active_types.h \
 	i915_debugfs.h \
 	i915_drv.h \
+	i915_gem_gtt.h \
 	i915_irq.h \
 	i915_params.h \
 	i915_priolist_types.h \
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 8ab820145ea6..d9eddbd79670 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -482,9 +482,69 @@  static void vm_free_page(struct i915_address_space *vm, struct page *page)
 	spin_unlock(&vm->free_pages.lock);
 }
 
+static void i915_address_space_fini(struct i915_address_space *vm)
+{
+	spin_lock(&vm->free_pages.lock);
+	if (pagevec_count(&vm->free_pages.pvec))
+		vm_free_pages_release(vm, true);
+	GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec));
+	spin_unlock(&vm->free_pages.lock);
+
+	drm_mm_takedown(&vm->mm);
+
+	mutex_destroy(&vm->mutex);
+}
+
+static void ppgtt_destroy_vma(struct i915_address_space *vm)
+{
+	struct list_head *phases[] = {
+		&vm->bound_list,
+		&vm->unbound_list,
+		NULL,
+	}, **phase;
+
+	mutex_lock(&vm->i915->drm.struct_mutex);
+	for (phase = phases; *phase; phase++) {
+		struct i915_vma *vma, *vn;
+
+		list_for_each_entry_safe(vma, vn, *phase, vm_link)
+			i915_vma_destroy(vma);
+	}
+	mutex_unlock(&vm->i915->drm.struct_mutex);
+}
+
+static void __i915_vm_release(struct work_struct *work)
+{
+	struct i915_address_space *vm =
+		container_of(work, struct i915_address_space, rcu.work);
+
+	ppgtt_destroy_vma(vm);
+
+	GEM_BUG_ON(!list_empty(&vm->bound_list));
+	GEM_BUG_ON(!list_empty(&vm->unbound_list));
+
+	vm->cleanup(vm);
+	i915_address_space_fini(vm);
+
+	kfree(vm);
+}
+
+void i915_vm_release(struct kref *kref)
+{
+	struct i915_address_space *vm =
+		container_of(kref, struct i915_address_space, ref);
+
+	GEM_BUG_ON(i915_is_ggtt(vm));
+	trace_i915_ppgtt_release(vm);
+
+	vm->closed = true;
+	queue_rcu_work(system_unbound_wq, &vm->rcu);
+}
+
 static void i915_address_space_init(struct i915_address_space *vm, int subclass)
 {
 	kref_init(&vm->ref);
+	INIT_RCU_WORK(&vm->rcu, __i915_vm_release);
 
 	/*
 	 * The vm->mutex must be reclaim safe (for use in the shrinker).
@@ -505,19 +565,6 @@  static void i915_address_space_init(struct i915_address_space *vm, int subclass)
 	INIT_LIST_HEAD(&vm->bound_list);
 }
 
-static void i915_address_space_fini(struct i915_address_space *vm)
-{
-	spin_lock(&vm->free_pages.lock);
-	if (pagevec_count(&vm->free_pages.pvec))
-		vm_free_pages_release(vm, true);
-	GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec));
-	spin_unlock(&vm->free_pages.lock);
-
-	drm_mm_takedown(&vm->mm);
-
-	mutex_destroy(&vm->mutex);
-}
-
 static int __setup_page_dma(struct i915_address_space *vm,
 			    struct i915_page_dma *p,
 			    gfp_t gfp)
@@ -2250,42 +2297,6 @@  i915_ppgtt_create(struct drm_i915_private *i915)
 	return ppgtt;
 }
 
-static void ppgtt_destroy_vma(struct i915_address_space *vm)
-{
-	struct list_head *phases[] = {
-		&vm->bound_list,
-		&vm->unbound_list,
-		NULL,
-	}, **phase;
-
-	vm->closed = true;
-	for (phase = phases; *phase; phase++) {
-		struct i915_vma *vma, *vn;
-
-		list_for_each_entry_safe(vma, vn, *phase, vm_link)
-			i915_vma_destroy(vma);
-	}
-}
-
-void i915_vm_release(struct kref *kref)
-{
-	struct i915_address_space *vm =
-		container_of(kref, struct i915_address_space, ref);
-
-	GEM_BUG_ON(i915_is_ggtt(vm));
-	trace_i915_ppgtt_release(vm);
-
-	ppgtt_destroy_vma(vm);
-
-	GEM_BUG_ON(!list_empty(&vm->bound_list));
-	GEM_BUG_ON(!list_empty(&vm->unbound_list));
-
-	vm->cleanup(vm);
-	i915_address_space_fini(vm);
-
-	kfree(vm);
-}
-
 /* Certain Gen5 chipsets require require idling the GPU before
  * unmapping anything from the GTT when VT-d is enabled.
  */
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 812717ccc69b..8de57f67a911 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -35,8 +35,12 @@ 
 #define __I915_GEM_GTT_H__
 
 #include <linux/io-mapping.h>
+#include <linux/kref.h>
 #include <linux/mm.h>
 #include <linux/pagevec.h>
+#include <linux/workqueue.h>
+
+#include <drm/drm_mm.h>
 
 #include "gt/intel_reset.h"
 #include "i915_gem_fence_reg.h"
@@ -280,6 +284,7 @@  struct pagestash {
 
 struct i915_address_space {
 	struct kref ref;
+	struct rcu_work rcu;
 
 	struct drm_mm mm;
 	struct drm_i915_private *i915;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 1a60b9fe8221..0c47276ed5df 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -208,9 +208,7 @@  static int igt_ppgtt_alloc(void *arg)
 	}
 
 err_ppgtt_cleanup:
-	mutex_lock(&dev_priv->drm.struct_mutex);
 	i915_vm_put(&ppgtt->vm);
-	mutex_unlock(&dev_priv->drm.struct_mutex);
 	return err;
 }