[v2,1/3] drm/i915: Enable lockless lookup of request tracking via RCU

Message ID 1450877756-2902-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Dec. 23, 2015, 1:35 p.m. UTC
If we enable RCU for the requests (providing a grace period where we can
inspect a "dead" request before it is freed), we can allow callers to
carefully perform lockless lookup of an active request.

However, by enabling deferred freeing of requests, we can potentially
hog a lot of memory when dealing with tens of thousands of requests per
second - with a quick insertion of a synchronize_rcu() inside our
shrinker callback, that issue disappears.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c          |  3 ++-
 drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
 drivers/gpu/drm/i915/i915_gem_request.h  | 24 +++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_shrinker.c |  1 +
 4 files changed, 27 insertions(+), 3 deletions(-)

Comments

Daniel Vetter Jan. 5, 2016, 2:59 p.m. UTC | #1
On Wed, Dec 23, 2015 at 01:35:54PM +0000, Chris Wilson wrote:
> If we enable RCU for the requests (providing a grace period where we can
> inspect a "dead" request before it is freed), we can allow callers to
> carefully perform lockless lookup of an active request.
> 
> However, by enabling deferred freeing of requests, we can potentially
> hog a lot of memory when dealing with tens of thousands of requests per
> second - with a quick insertion of a synchronize_rcu() inside our
> shrinker callback, that issue disappears.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem.c          |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
>  drivers/gpu/drm/i915/i915_gem_request.h  | 24 +++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gem_shrinker.c |  1 +
>  4 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c169574758d5..696ada3891ed 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4222,7 +4222,8 @@ i915_gem_load(struct drm_device *dev)
>  	dev_priv->requests =
>  		kmem_cache_create("i915_gem_request",
>  				  sizeof(struct drm_i915_gem_request), 0,
> -				  SLAB_HWCACHE_ALIGN,
> +				  SLAB_HWCACHE_ALIGN |
> +				  SLAB_DESTROY_BY_RCU,
>  				  NULL);
>  
>  	INIT_LIST_HEAD(&dev_priv->context_list);

[snip i915 private changes, leave just slab/shrinker changes]

> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index c561ed2b8287..03a8bbb3e31e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -142,6 +142,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
>  	}
>  
>  	i915_gem_retire_requests(dev_priv->dev);
> +	synchronize_rcu(); /* expedite the grace period to free the requests */

Shouldn't the slab subsystem do this for us if we request that it delay the
actual kfree? Seems like a core bug to me ... Adding more folks.
-Daniel
Peter Zijlstra Jan. 5, 2016, 3:02 p.m. UTC | #2
On Tue, Jan 05, 2016 at 03:59:51PM +0100, Daniel Vetter wrote:
> On Wed, Dec 23, 2015 at 01:35:54PM +0000, Chris Wilson wrote:
> > If we enable RCU for the requests (providing a grace period where we can
> > inspect a "dead" request before it is freed), we can allow callers to
> > carefully perform lockless lookup of an active request.
> > 
> > However, by enabling deferred freeing of requests, we can potentially
> > hog a lot of memory when dealing with tens of thousands of requests per
> > second - with a quick insertion of a synchronize_rcu() inside our
> > shrinker callback, that issue disappears.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c          |  3 ++-
> >  drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
> >  drivers/gpu/drm/i915/i915_gem_request.h  | 24 +++++++++++++++++++++++-
> >  drivers/gpu/drm/i915/i915_gem_shrinker.c |  1 +
> >  4 files changed, 27 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index c169574758d5..696ada3891ed 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -4222,7 +4222,8 @@ i915_gem_load(struct drm_device *dev)
> >  	dev_priv->requests =
> >  		kmem_cache_create("i915_gem_request",
> >  				  sizeof(struct drm_i915_gem_request), 0,
> > -				  SLAB_HWCACHE_ALIGN,
> > +				  SLAB_HWCACHE_ALIGN |
> > +				  SLAB_DESTROY_BY_RCU,
> >  				  NULL);
> >  
> >  	INIT_LIST_HEAD(&dev_priv->context_list);
> 
> [snip i915 private changes, leave just slab/shrinker changes]
> 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > index c561ed2b8287..03a8bbb3e31e 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > @@ -142,6 +142,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
> >  	}
> >  
> >  	i915_gem_retire_requests(dev_priv->dev);
> > +	synchronize_rcu(); /* expedite the grace period to free the requests */
> 
> Shouldn't the slab subsystem do this for us if we request that it delay the
> actual kfree? Seems like a core bug to me ... Adding more folks.

note that sync_rcu() can take a terribly long time.. but yes, I seem to
remember Paul talking about adding this to reclaim paths for just this
reason. Not sure that ever happened though.
Peter Zijlstra Jan. 5, 2016, 3:06 p.m. UTC | #3
On Tue, Jan 05, 2016 at 04:02:13PM +0100, Peter Zijlstra wrote:
> > Shouldn't the slab subsystem do this for us if we request that it delay the
> > actual kfree? Seems like a core bug to me ... Adding more folks.
> 
> note that sync_rcu() can take a terribly long time.. but yes, I seem to
> remember Paul talking about adding this to reclaim paths for just this
> reason. Not sure that ever happened though.

Also, you might be wanting rcu_barrier() instead, that not only waits
for a GP to complete, but also for all pending callbacks to be
processed.

Without the latter there might still not be anything to free after it.
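
A sketch of that in the shrinker path above (illustrative only, not part
of the patch; the i915_gem_shrink() context is assumed):

	i915_gem_retire_requests(dev_priv->dev);
	rcu_barrier(); /* wait for the grace period *and* for all pending
			* callbacks to run, so memory is actually freed */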
Paul E. McKenney Jan. 5, 2016, 4:35 p.m. UTC | #4
On Tue, Jan 05, 2016 at 04:06:48PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 05, 2016 at 04:02:13PM +0100, Peter Zijlstra wrote:
> > > Shouldn't the slab subsystem do this for us if we request that it delay the
> > > actual kfree? Seems like a core bug to me ... Adding more folks.
> > 
> > note that sync_rcu() can take a terribly long time.. but yes, I seem to
> > remember Paul talking about adding this to reclaim paths for just this
> > reason. Not sure that ever happened though.

There is an RCU OOM notifier, but it just ensures that existing callbacks
get processed in a timely fashion.  It does not block, as that would
prevent other OOM notifiers from getting their memory freed quickly.

> Also, you might be wanting rcu_barrier() instead, that not only waits
> for a GP to complete, but also for all pending callbacks to be
> processed.

And in fact what the RCU OOM notifier does can be thought of as an
asynchronous open-coded rcu_barrier().  If you are interested, please
see rcu_register_oom_notifier() and friends.

> Without the latter there might still not be anything to free after it.

Another approach is synchronize_rcu() after some largish number of
requests.  The advantage of this approach is that it throttles the
production of callbacks at the source.  The corresponding disadvantage
is that it slows things up.
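
A sketch of that batching (the counter and threshold here are made up
purely for illustration):

	static unsigned long nr_requests_since_sync;

	if (++nr_requests_since_sync >= 10000) {
		nr_requests_since_sync = 0;
		synchronize_rcu(); /* throttle callback production at the source */
	}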

Another approach is to use call_rcu(), but if the previous call_rcu()
is still in flight, block waiting for it.  Yet another approach is
the get_state_synchronize_rcu() / cond_synchronize_rcu() pair.  The
idea is to do something like this:

	cond_synchronize_rcu(cookie);
	cookie = get_state_synchronize_rcu();

You would of course do an initial get_state_synchronize_rcu() to
get things going.  This would not block unless there was less than
one grace period's worth of time between invocations.  But this
assumes a busy system, where there is almost always a grace period
in flight.  But you can make that happen as follows:

	cond_synchronize_rcu(cookie);
	cookie = get_state_synchronize_rcu();
	call_rcu(&my_rcu_head, noop_function);

Note that you need additional code to make sure that the old callback
has completed before doing a new one.  Setting and clearing a flag
with appropriate memory ordering control suffices (e.g., smp_load_acquire()
and smp_store_release()).
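
Putting those pieces together, roughly (all names here are illustrative):

	static struct rcu_head my_rcu_head;
	static unsigned long cookie;
	static int cb_done = 1;	/* no callback in flight initially */

	static void noop_function(struct rcu_head *head)
	{
		smp_store_release(&cb_done, 1);	/* old callback has completed */
	}

	/* on each pass: */
	cond_synchronize_rcu(cookie);	/* blocks only if no GP has elapsed */
	cookie = get_state_synchronize_rcu();
	if (smp_load_acquire(&cb_done)) {
		cb_done = 0;
		call_rcu(&my_rcu_head, noop_function); /* keep a GP in flight */
	}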

And there are probably other approaches as well...

							Thanx, Paul
Daniel Vetter Jan. 6, 2016, 8:06 a.m. UTC | #5
On Tue, Jan 05, 2016 at 08:35:37AM -0800, Paul E. McKenney wrote:
> On Tue, Jan 05, 2016 at 04:06:48PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 05, 2016 at 04:02:13PM +0100, Peter Zijlstra wrote:
> > > > Shouldn't the slab subsystem do this for us if we request that it delay the
> > > > actual kfree? Seems like a core bug to me ... Adding more folks.
> > > 
> > > note that sync_rcu() can take a terribly long time.. but yes, I seem to
> > > remember Paul talking about adding this to reclaim paths for just this
> > > reason. Not sure that ever happened though.
> 
> There is an RCU OOM notifier, but it just ensures that existing callbacks
> get processed in a timely fashion.  It does not block, as that would
> prevent other OOM notifiers from getting their memory freed quickly.
> 
> > Also, you might be wanting rcu_barrier() instead, that not only waits
> > for a GP to complete, but also for all pending callbacks to be
> > processed.
> 
> And in fact what the RCU OOM notifier does can be thought of as an
> asynchronous open-coded rcu_barrier().  If you are interested, please
> see rcu_register_oom_notifier() and friends.
> 
> > Without the latter there might still not be anything to free after it.
> 
> Another approach is synchronize_rcu() after some largish number of
> requests.  The advantage of this approach is that it throttles the
> production of callbacks at the source.  The corresponding disadvantage
> is that it slows things up.
> 
> Another approach is to use call_rcu(), but if the previous call_rcu()
> is still in flight, block waiting for it.  Yet another approach is
> the get_state_synchronize_rcu() / cond_synchronize_rcu() pair.  The
> idea is to do something like this:
> 
> 	cond_synchronize_rcu(cookie);
> 	cookie = get_state_synchronize_rcu();
> 
> You would of course do an initial get_state_synchronize_rcu() to
> get things going.  This would not block unless there was less than
> one grace period's worth of time between invocations.  But this
> assumes a busy system, where there is almost always a grace period
> in flight.  But you can make that happen as follows:
> 
> 	cond_synchronize_rcu(cookie);
> 	cookie = get_state_synchronize_rcu();
> 	call_rcu(&my_rcu_head, noop_function);
> 
> Note that you need additional code to make sure that the old callback
> has completed before doing a new one.  Setting and clearing a flag
> with appropriate memory ordering control suffices (e.g., smp_load_acquire()
> and smp_store_release()).

This pretty much went over my head ;-) What I naively hoped for is that
kfree() on an rcu-freeing slab could be taught to magically stall a bit
(or at least expedite the delayed freeing) if we're piling up too many
freed objects. Doing that only in OOM is probably too late since OOM
handling is a bit unreliable/unpredictable. And I thought we're not the
first ones running into this problem.

Do all the other users of rcu-freed slabs just open-code their own custom
approach? If that's the recommendation we can certainly follow that, too.
-Daniel
Peter Zijlstra Jan. 6, 2016, 8:38 a.m. UTC | #6
On Wed, Jan 06, 2016 at 09:06:58AM +0100, Daniel Vetter wrote:
> This pretty much went over my head ;-) What I naively hoped for is that
> kfree() on an rcu-freeing slab could be taught to magically stall a bit
> (or at least expedite the delayed freeing) if we're piling up too many
> freed objects.

Well, RCU does try harder when the callback list is getting 'big' (10k
IIRC).

> Doing that only in OOM is probably too late since OOM
> handling is a bit unreliable/unpredictable. And I thought we're not the
> first ones running into this problem.

The whole memory pressure thing is unreliable/unpredictable last time I
looked at it, but sure, I suppose we could try and poke RCU sooner, but
then you get into the problem of when: doing it too soon will be
detrimental to performance; doing it too late is, well, too late.

> Do all the other users of rcu-freed slabs just open-code their own custom
> approach? If that's the recommendation we can certainly follow that, too.

The ones I know of seem to simply ignore this problem..
Paul E. McKenney Jan. 6, 2016, 3:56 p.m. UTC | #7
On Wed, Jan 06, 2016 at 09:38:30AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 06, 2016 at 09:06:58AM +0100, Daniel Vetter wrote:
> > This pretty much went over my head ;-) What I naively hoped for is that
> > kfree() on an rcu-freeing slab could be taught to magically stall a bit
> > (or at least expedite the delayed freeing) if we're piling up too many
> > freed objects.
> 
> Well, RCU does try harder when the callback list is getting 'big' (10k
> IIRC).

You got it, 10k by default, can be adjusted with the rcutree.qhimark
kernel-boot/sysfs parameter.  When a given CPU's callback list exceeds
this limit, it more aggressively starts a grace period, and if a grace
period is already in progress, it does more aggressive quiescent-state
forcing.  It does nothing to push back on processes generating callbacks,
other than by soaking up extra CPU cycles.

So, Daniel, if you haven't tried hammering the system hard, give it a
shot and see if qhimark is helping enough.  And perhaps adjust its value
if need be.  (Though please let me know if this is necessary -- if it is,
we should try to automate its setting.)
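
For reference, the knob looks like this (the value is purely an example):

	rcutree.qhimark=20000				# kernel command line
	cat /sys/module/rcutree/parameters/qhimark	# read the current value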

> > Doing that only in OOM is probably too late since OOM
> > handling is a bit unreliable/unpredictable. And I thought we're not the
> > first ones running into this problem.
> 
> The whole memory pressure thing is unreliable/unpredictable last time I
> looked at it, but sure, I suppose we could try and poke RCU sooner, but
> then you get into the problem of when: doing it too soon will be
> detrimental to performance; doing it too late is, well, too late.
> 
> > Do all the other users of rcu-freed slabs just open-code their own custom
> > approach? If that's the recommendation we can certainly follow that, too.
> 
> The ones I know of seem to simply ignore this problem..

I believe that there are a few that do the occasional synchronize_rcu()
to throttle themselves, but have not checked recently.

							Thanx, Paul

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c169574758d5..696ada3891ed 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4222,7 +4222,8 @@ i915_gem_load(struct drm_device *dev)
 	dev_priv->requests =
 		kmem_cache_create("i915_gem_request",
 				  sizeof(struct drm_i915_gem_request), 0,
-				  SLAB_HWCACHE_ALIGN,
+				  SLAB_HWCACHE_ALIGN |
+				  SLAB_DESTROY_BY_RCU,
 				  NULL);
 
 	INIT_LIST_HEAD(&dev_priv->context_list);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 4e087143b1d2..380a5963d957 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -335,7 +335,7 @@ static void __i915_gem_request_retire_active(struct drm_i915_gem_request *req)
 		 * and pass along the auxiliary information.
 		 */
 		INIT_LIST_HEAD(&active->link);
-		active->request = NULL;
+		rcu_assign_pointer(active->request, NULL);
 
 		active->retire(active, req);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 4eb15ed9205e..fae85b111a1e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -139,6 +139,12 @@ i915_gem_request_get(struct drm_i915_gem_request *req)
 	return to_request(fence_get(&req->fence));
 }
 
+static inline struct drm_i915_gem_request *
+i915_gem_request_get_rcu(struct drm_i915_gem_request *req)
+{
+	return to_request(fence_get_rcu(&req->fence));
+}
+
 static inline void
 i915_gem_request_put(struct drm_i915_gem_request *req)
 {
@@ -243,7 +249,23 @@ i915_gem_request_mark_active(struct drm_i915_gem_request *request,
 			     struct i915_gem_active *active)
 {
 	list_move(&active->link, &request->active_list);
-	active->request = request;
+	rcu_assign_pointer(active->request, request);
+}
+
+static inline struct drm_i915_gem_request *
+i915_gem_active_get_request_rcu(struct i915_gem_active *active)
+{
+	do {
+		struct drm_i915_gem_request *request;
+
+		request = rcu_dereference(active->request);
+		if (request == NULL)
+			return NULL;
+
+		request = i915_gem_request_get_rcu(request);
+		if (request)
+			return request;
+	} while (1);
 }
 
 #endif /* I915_GEM_REQUEST_H */
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index c561ed2b8287..03a8bbb3e31e 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -142,6 +142,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 	}
 
 	i915_gem_retire_requests(dev_priv->dev);
+	synchronize_rcu(); /* expedite the grace period to free the requests */
 
 	return count;
 }
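
A minimal sketch of a caller of the new lockless lookup (the `active`
tracker used here is illustrative; the RCU read lock must be held across
the lookup itself):

	rcu_read_lock();
	request = i915_gem_active_get_request_rcu(active);
	rcu_read_unlock();

	/* a non-NULL request now holds a full reference and may be used
	 * outside the RCU read-side critical section */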