diff mbox series

[3/3] drm/i915: Wait for the previous RCU grace period, not request completion

Message ID 20180912164015.28362-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/i915: Limit the backpressure for i915_request allocation | expand

Commit Message

Chris Wilson Sept. 12, 2018, 4:40 p.m. UTC
Under mempressure, our goal is to allow ourselves sufficient time to
reclaim the RCU protected slabs without overly penalizing our clients.
Currently, we use a 1 jiffie wait if the client is still active as a
means of throttling the allocations, but we can instead wait for the end
of the RCU grace period of the client's previous allocation.

Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_request.c | 14 ++++++--------
 drivers/gpu/drm/i915/i915_request.h |  8 ++++++++
 2 files changed, 14 insertions(+), 8 deletions(-)

Comments

Tvrtko Ursulin Sept. 13, 2018, 11:16 a.m. UTC | #1
On 12/09/2018 17:40, Chris Wilson wrote:
> Under mempressure, our goal is to allow ourselves sufficient time to
> reclaim the RCU protected slabs without overly penalizing our clients.
> Currently, we use a 1 jiffie wait if the client is still active as a
> means of throttling the allocations, but we can instead wait for the end
> of the RCU grace period of the clients previous allocation.

Why did you opt for three patches changing the same code and just squash 
to last?

Regards,

Tvrtko

> Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>   drivers/gpu/drm/i915/i915_request.c | 14 ++++++--------
>   drivers/gpu/drm/i915/i915_request.h |  8 ++++++++
>   2 files changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 72bcb4ca0c45..a492385b2089 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -732,17 +732,13 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	rq = kmem_cache_alloc(i915->requests,
>   			      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
>   	if (unlikely(!rq)) {
> +		i915_retire_requests(i915);
> +
>   		/* Ratelimit ourselves to prevent oom from malicious clients */
>   		rq = i915_gem_active_raw(&ce->ring->timeline->last_request,
>   					 &i915->drm.struct_mutex);
> -		if (rq && i915_request_wait(rq,
> -					    I915_WAIT_LOCKED |
> -					    I915_WAIT_INTERRUPTIBLE,
> -					    1) == -EINTR) {
> -			ret = -EINTR;
> -			goto err_unreserve;
> -		}
> -		i915_retire_requests(i915);
> +		if (rq)
> +			cond_synchronize_rcu(rq->rcustate);
>   
>   		/*
>   		 * We've forced the client to stall and catch up with whatever
> @@ -762,6 +758,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   		}
>   	}
>   
> +	rq->rcustate = get_state_synchronize_rcu();
> +
>   	INIT_LIST_HEAD(&rq->active_list);
>   	rq->i915 = i915;
>   	rq->engine = engine;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 9898301ab7ef..7fa94b024968 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -100,6 +100,14 @@ struct i915_request {
>   	struct i915_timeline *timeline;
>   	struct intel_signal_node signaling;
>   
> +	/*
> +	 * The rcu epoch of when this request was allocated. Used to judiciously
> +	 * apply backpressure on future allocations to ensure that under
> +	 * mempressure there is sufficient RCU ticks for us to reclaim our
> +	 * RCU protected slabs.
> +	 */
> +	unsigned long rcustate;
> +
>   	/*
>   	 * Fences for the various phases in the request's lifetime.
>   	 *
>
Chris Wilson Sept. 13, 2018, 11:18 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-09-13 12:16:42)
> 
> On 12/09/2018 17:40, Chris Wilson wrote:
> > Under mempressure, our goal is to allow ourselves sufficient time to
> > reclaim the RCU protected slabs without overly penalizing our clients.
> > Currently, we use a 1 jiffie wait if the client is still active as a
> > means of throttling the allocations, but we can instead wait for the end
> > of the RCU grace period of the clients previous allocation.
> 
> Why did you opt for three patches changing the same code and just squash 
> to last?

1 introduced a timeout, 2 limited it to the single timeline, 3 changed
what we are waiting on entirely. Each of those are big jumps, and only
(1) is required to fix the bug.
-Chris
Tvrtko Ursulin Sept. 13, 2018, 11:29 a.m. UTC | #3
On 13/09/2018 12:18, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-09-13 12:16:42)
>>
>> On 12/09/2018 17:40, Chris Wilson wrote:
>>> Under mempressure, our goal is to allow ourselves sufficient time to
>>> reclaim the RCU protected slabs without overly penalizing our clients.
>>> Currently, we use a 1 jiffie wait if the client is still active as a
>>> means of throttling the allocations, but we can instead wait for the end
>>> of the RCU grace period of the clients previous allocation.
>>
>> Why did you opt for three patches changing the same code and just squash
>> to last?
> 
> 1 introduced a timeout, 2 limited it to the single timeline, 3 changed
> what we are waiting on entirely. Each of those are big jumps, and only
> (1) is required to fix the bug.

I completely understand that, just question why we want to review all 
the intermediate solutions only to end up with the superior one in terms 
of logic, design and simplicity.

Because as said before, I don't really approve of patch 1 that much, 
even if it fixes a bug.

Two is already superior, but three is right to the point of what problem 
you want to solve. (Even if I haven't looked into the exact RCU API yet, 
but it looks believable.)

Anyway, I'll let the other guys comment, maybe it is just me who is too 
picky.

Regards,

Tvrtko
Chris Wilson Sept. 13, 2018, 11:35 a.m. UTC | #4
Quoting Tvrtko Ursulin (2018-09-13 12:29:46)
> 
> On 13/09/2018 12:18, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-09-13 12:16:42)
> >>
> >> On 12/09/2018 17:40, Chris Wilson wrote:
> >>> Under mempressure, our goal is to allow ourselves sufficient time to
> >>> reclaim the RCU protected slabs without overly penalizing our clients.
> >>> Currently, we use a 1 jiffie wait if the client is still active as a
> >>> means of throttling the allocations, but we can instead wait for the end
> >>> of the RCU grace period of the clients previous allocation.
> >>
> >> Why did you opt for three patches changing the same code and just squash
> >> to last?
> > 
> > 1 introduced a timeout, 2 limited it to the single timeline, 3 changed
> > what we are waiting on entirely. Each of those are big jumps, and only
> > (1) is required to fix the bug.
> 
> I completely understand that, just question why we want to review all 
> the intermediate solutions only to end up with superior one in terms of 
> both logic, design and simplicity.

Depends on viewpoint.
 
> Because as said before, I don't really approve of patch 1 that much, 
> even if it fixes a bug.
> 
> Two is already superior, but three is right to the point of what problem 
> you want to solve. (Even if I haven't looked into the exact RCU API yet, 
> but it looks believable.)

2 mixes global/local without any clue as to whether local is a good
idea. I think that switch deserves argument (because what good is
pretending to only check the local client when there's a massive global
bottleneck in the following lines).

The switch over to using waiting a single grace period itself is also
dubious, because there is even less to link that back to gpu behaviour and
that I feel may be more crucial than the handwaving in (1) gives credit
for.

And then there are the shivers that come from having a big global
barrier in something that needs to learn to be lean and scalable.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 72bcb4ca0c45..a492385b2089 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -732,17 +732,13 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq = kmem_cache_alloc(i915->requests,
 			      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
 	if (unlikely(!rq)) {
+		i915_retire_requests(i915);
+
 		/* Ratelimit ourselves to prevent oom from malicious clients */
 		rq = i915_gem_active_raw(&ce->ring->timeline->last_request,
 					 &i915->drm.struct_mutex);
-		if (rq && i915_request_wait(rq,
-					    I915_WAIT_LOCKED |
-					    I915_WAIT_INTERRUPTIBLE,
-					    1) == -EINTR) {
-			ret = -EINTR;
-			goto err_unreserve;
-		}
-		i915_retire_requests(i915);
+		if (rq)
+			cond_synchronize_rcu(rq->rcustate);
 
 		/*
 		 * We've forced the client to stall and catch up with whatever
@@ -762,6 +758,8 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 		}
 	}
 
+	rq->rcustate = get_state_synchronize_rcu();
+
 	INIT_LIST_HEAD(&rq->active_list);
 	rq->i915 = i915;
 	rq->engine = engine;
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 9898301ab7ef..7fa94b024968 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -100,6 +100,14 @@  struct i915_request {
 	struct i915_timeline *timeline;
 	struct intel_signal_node signaling;
 
+	/*
+	 * The rcu epoch of when this request was allocated. Used to judiciously
+	 * apply backpressure on future allocations to ensure that under
+	 * mempressure there is sufficient RCU ticks for us to reclaim our
+	 * RCU protected slabs.
+	 */
+	unsigned long rcustate;
+
 	/*
 	 * Fences for the various phases in the request's lifetime.
 	 *