| Message ID | 1349881739-32277-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
|---|---|
| State | New, archived |
On Wed, Oct 10, 2012 at 5:08 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> If we have hit oom whilst holding our struct_mutex, then currently we
> cannot reap our own GPU buffers which likely pin most of memory, making
> an outright OOM more likely. So if we are running in direct reclaim and
> already hold the mutex, attempt to free buffers knowing that the
> original function can not continue until we return.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

I've thought a bit about this, and I fear the implications. It's a very
neat trick, but now every memory alloc call could potentially result in
unpinned objects getting unbound and in active objects getting retired.

Previously we only needed to fear active objects disappearing when
calling retire_request (since that could drop the last reference). With
this change we have many more places, and given how often we e.g.
fumbled the refcounting in the fence stealing code I'm scared ...

/me needs to think more about this

Cheers, Daniel

> ---
>  drivers/gpu/drm/i915/i915_gem.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1d0cbfb..bed4084 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4589,6 +4589,18 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>
> +static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
> +{
> +	if (!mutex_is_locked(mutex))
> +		return false;
> +
> +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
> +	return mutex->owner == task;
> +#else
> +	return false;
> +#endif
> +}
> +
>  static int
>  i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
>  {
> @@ -4599,10 +4611,15 @@ i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
>  	struct drm_device *dev = dev_priv->dev;
>  	struct drm_i915_gem_object *obj;
>  	int nr_to_scan = sc->nr_to_scan;
> +	bool unlock = true;
>  	int cnt;
>
> -	if (!mutex_trylock(&dev->struct_mutex))
> -		return 0;
> +	if (!mutex_trylock(&dev->struct_mutex)) {
> +		if (mutex_is_locked_by(&dev->struct_mutex, current))
> +			unlock = false;
> +		else
> +			return 0;
> +	}
>
>  	if (nr_to_scan) {
>  		nr_to_scan -= i915_gem_purge(dev_priv, nr_to_scan);
> @@ -4618,6 +4635,7 @@ i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
>  	if (obj->pin_count == 0 && obj->pages_pin_count == 0)
>  		cnt += obj->base.size >> PAGE_SHIFT;
>
> -	mutex_unlock(&dev->struct_mutex);
> +	if (unlock)
> +		mutex_unlock(&dev->struct_mutex);
>  	return cnt;
> }
> --
> 1.7.10.4
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
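The hazard Daniel describes is easiest to see in miniature. Below is a userspace sketch, not taken from the thread or the driver (struct object, object_get/object_put, shrinker and alloc_with_reclaim are all invented names): once reclaim may run from any allocation made while we hold our own lock, a bare pointer into the driver's caches is no longer protected by that lock alone, and the caller must hold its own reference across the allocation.

/*
 * Userspace sketch of the hazard (all names invented): a cached object,
 * a "shrinker" that may drop the cache's reference, and an allocator
 * that can invoke the shrinker from direct reclaim. The caller takes
 * its own reference before allocating; remove the get/put pair and the
 * final printf becomes a use-after-free.
 */
#include <stdio.h>
#include <stdlib.h>

struct object {
	int refcount;
	int payload;
};

static struct object *cache;		/* one object owned by the "driver" */

static void object_get(struct object *obj)
{
	obj->refcount++;
}

static void object_put(struct object *obj)
{
	if (--obj->refcount == 0)
		free(obj);
}

/* Pretend shrinker: under memory pressure, drop the cache's reference. */
static void shrinker(void)
{
	struct object *victim = cache;

	if (victim) {
		cache = NULL;
		object_put(victim);
	}
}

/* Pretend allocator that enters "direct reclaim" and calls the shrinker. */
static void *alloc_with_reclaim(size_t n)
{
	shrinker();
	return malloc(n);
}

int main(void)
{
	cache = calloc(1, sizeof(*cache));
	cache->refcount = 1;		/* the cache's own reference */
	cache->payload = 42;

	struct object *obj = cache;
	object_get(obj);		/* our reference, held across the allocation */

	void *tmp = alloc_with_reclaim(64);	/* may reap the cache behind us */

	printf("payload=%d\n", obj->payload);	/* safe only because of object_get() */

	object_put(obj);
	free(tmp);
	return 0;
}

Dropping the object_get()/object_put() pair around the allocation turns the printf into a use-after-free, which is exactly the class of refcounting slip the review above is worried about multiplying.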
On Wed, 10 Oct 2012 16:08:59 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> If we have hit oom whilst holding our struct_mutex, then currently we
> cannot reap our own GPU buffers which likely pin most of memory, making
> an outright OOM more likely. So if we are running in direct reclaim and
> already hold the mutex, attempt to free buffers knowing that the
> original function can not continue until we return.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

I will try to review this a bit further on the plane ride home. It's
quite a scary patch, but it fixes OOM destroying IGT runs, so it's
pretty necessary IMO.

Meanwhile:
Tested-by: Ben Widawsky <ben@bwidawsk.net>

> ---
>  drivers/gpu/drm/i915/i915_gem.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1d0cbfb..bed4084 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4589,6 +4589,18 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>
> +static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
> +{
> +	if (!mutex_is_locked(mutex))
> +		return false;
> +
> +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
> +	return mutex->owner == task;
> +#else
> +	return false;
> +#endif
> +}
> +
>  static int
>  i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
>  {
> @@ -4599,10 +4611,15 @@ i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
>  	struct drm_device *dev = dev_priv->dev;
>  	struct drm_i915_gem_object *obj;
>  	int nr_to_scan = sc->nr_to_scan;
> +	bool unlock = true;
>  	int cnt;
>
> -	if (!mutex_trylock(&dev->struct_mutex))
> -		return 0;
> +	if (!mutex_trylock(&dev->struct_mutex)) {
> +		if (mutex_is_locked_by(&dev->struct_mutex, current))
> +			unlock = false;
> +		else
> +			return 0;
> +	}
>
>  	if (nr_to_scan) {
>  		nr_to_scan -= i915_gem_purge(dev_priv, nr_to_scan);
> @@ -4618,6 +4635,7 @@ i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
>  	if (obj->pin_count == 0 && obj->pages_pin_count == 0)
>  		cnt += obj->base.size >> PAGE_SHIFT;
>
> -	mutex_unlock(&dev->struct_mutex);
> +	if (unlock)
> +		mutex_unlock(&dev->struct_mutex);
>  	return cnt;
> }
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1d0cbfb..bed4084 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4589,6 +4589,18 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
+{
+	if (!mutex_is_locked(mutex))
+		return false;
+
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
+	return mutex->owner == task;
+#else
+	return false;
+#endif
+}
+
 static int
 i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
 {
@@ -4599,10 +4611,15 @@ i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
 	struct drm_device *dev = dev_priv->dev;
 	struct drm_i915_gem_object *obj;
 	int nr_to_scan = sc->nr_to_scan;
+	bool unlock = true;
 	int cnt;
 
-	if (!mutex_trylock(&dev->struct_mutex))
-		return 0;
+	if (!mutex_trylock(&dev->struct_mutex)) {
+		if (mutex_is_locked_by(&dev->struct_mutex, current))
+			unlock = false;
+		else
+			return 0;
+	}
 
 	if (nr_to_scan) {
 		nr_to_scan -= i915_gem_purge(dev_priv, nr_to_scan);
@@ -4618,6 +4635,7 @@ i915_gem_inactive_shrink(struct shrinker *shrinker, struct shrink_control *sc)
 	if (obj->pin_count == 0 && obj->pages_pin_count == 0)
 		cnt += obj->base.size >> PAGE_SHIFT;
 
-	mutex_unlock(&dev->struct_mutex);
+	if (unlock)
+		mutex_unlock(&dev->struct_mutex);
 	return cnt;
 }
If we have hit oom whilst holding our struct_mutex, then currently we
cannot reap our own GPU buffers which likely pin most of memory, making
an outright OOM more likely. So if we are running in direct reclaim and
already hold the mutex, attempt to free buffers knowing that the
original function can not continue until we return.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)
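As a closing illustration, here is a minimal, runnable userspace analogue of the trylock/owner-check idiom the patch introduces. POSIX mutexes do not expose their owner, so this sketch tracks it by hand; big_lock, big_lock_is_mine and shrink_caches are invented names, not kernel API. In the kernel version the owner field only exists when CONFIG_DEBUG_MUTEXES or CONFIG_SMP is set, which is why mutex_is_locked_by() falls back to returning false otherwise.

/*
 * Userspace sketch (invented names) of the pattern in the patch:
 * if the trylock fails but the lock is held by this very thread,
 * proceed anyway and skip the unlock so the outer critical section
 * stays balanced.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t big_lock_owner;	/* meaningful only while big_lock_held */
static bool big_lock_held;

static void big_lock_acquire(void)
{
	pthread_mutex_lock(&big_lock);
	big_lock_owner = pthread_self();
	big_lock_held = true;
}

static void big_lock_release(void)
{
	big_lock_held = false;
	pthread_mutex_unlock(&big_lock);
}

/* Analogue of mutex_is_locked_by(): is the lock held by this thread? */
static bool big_lock_is_mine(void)
{
	return big_lock_held && pthread_equal(big_lock_owner, pthread_self());
}

/* Analogue of the shrinker: may be entered while the caller holds big_lock. */
static int shrink_caches(void)
{
	bool unlock = true;

	if (pthread_mutex_trylock(&big_lock) != 0) {
		if (big_lock_is_mine())
			unlock = false;		/* re-entered from our own allocation */
		else
			return 0;		/* someone else holds it: back off */
	} else {
		big_lock_owner = pthread_self();
		big_lock_held = true;
	}

	printf("reclaiming%s\n", unlock ? "" : " (recursively, lock left held)");

	if (unlock)
		big_lock_release();
	return 1;
}

int main(void)
{
	shrink_caches();	/* lock free: take it, reclaim, drop it */

	big_lock_acquire();
	shrink_caches();	/* lock held by us: reclaim without unlocking */
	big_lock_release();
	return 0;
}

The key property is that the recursive caller never releases a lock it did not take: unlock stays false, so the caller that originally acquired the mutex still holds it when the allocation returns.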