diff mbox

[v2] drm/i915/guc: Move GuC wq_reserve_space to alloc_request_extras

Message ID 1449687054-16903-1-git-send-email-yu.dai@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

yu.dai@intel.com Dec. 9, 2015, 6:50 p.m. UTC
From: Alex Dai <yu.dai@intel.com>

Split the GuC work queue space reservation out of submission and move it to
ring_alloc_request_extras. The reason is that a failure in the later
i915_add_request() call won't be handled. In the case that a timeout happens,
the driver can return early in order to handle the error.

v1: Move wq_reserve_space to ring_reserve_space
v2: Move wq_reserve_space to alloc_request_extras (Chris Wilson)

Signed-off-by: Alex Dai <yu.dai@intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 21 +++++++++------------
 drivers/gpu/drm/i915/intel_guc.h           |  1 +
 drivers/gpu/drm/i915/intel_lrc.c           |  6 ++++++
 3 files changed, 16 insertions(+), 12 deletions(-)

Comments

Dave Gordon Dec. 10, 2015, 5:14 p.m. UTC | #1
On 09/12/15 18:50, yu.dai@intel.com wrote:
> From: Alex Dai <yu.dai@intel.com>
>
> Split GuC work queue space reserve from submission and move it to
> ring_alloc_request_extras. The reason is that failure in later
> i915_add_request() won't be handled. In the case timeout happens,
> driver can return early in order to handle the error.
>
> v1: Move wq_reserve_space to ring_reserve_space
> v2: Move wq_reserve_space to alloc_request_extras (Chris Wilson)
>
> Signed-off-by: Alex Dai <yu.dai@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_guc_submission.c | 21 +++++++++------------
>   drivers/gpu/drm/i915/intel_guc.h           |  1 +
>   drivers/gpu/drm/i915/intel_lrc.c           |  6 ++++++
>   3 files changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 226e9c0..f7bd038 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -472,25 +472,21 @@ static void guc_fini_ctx_desc(struct intel_guc *guc,
>   			     sizeof(desc) * client->ctx_index);
>   }
>
> -/* Get valid workqueue item and return it back to offset */
> -static int guc_get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
> +int i915_guc_wq_reserve_space(struct i915_guc_client *gc)

I think the name is misleading, because we don't actually reserve 
anything here; we just check that there is some free space in the WQ.

(We certainly don't WANT to reserve anything, because that would be 
difficult to clean up in the event of submission failure for any other 
reason. So I think it's only the name that needs changing.) Although ...

>   {
>   	struct guc_process_desc *desc;
>   	void *base;
>   	u32 size = sizeof(struct guc_wq_item);
>   	int ret = -ETIMEDOUT, timeout_counter = 200;
>
> +	if (!gc)
> +		return 0;
> +
>   	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
>   	desc = base + gc->proc_desc_offset;
>
>   	while (timeout_counter-- > 0) {
>   		if (CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size) >= size) {

... as an alternative strategy, we could cache the calculated freespace 
in the client structure; then if we already know there's at least 1 slot 
free from last time we checked, we could then just decrement the cached 
value and avoid the kmap+spinwait overhead. Only when we reach 0 would 
we have to go through this code to refresh our view of desc->head and 
recalculate the actual current freespace. [NB: clear cached value on reset?]

Does that sound like a useful optimisation?

> -			*offset = gc->wq_tail;
> -
> -			/* advance the tail for next workqueue item */
> -			gc->wq_tail += size;
> -			gc->wq_tail &= gc->wq_size - 1;
> -
>   			/* this will break the loop */
>   			timeout_counter = 0;
>   			ret = 0;
> @@ -512,11 +508,12 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
>   	struct guc_wq_item *wqi;
>   	void *base;
>   	u32 tail, wq_len, wq_off = 0;
> -	int ret;
>
> -	ret = guc_get_workqueue_space(gc, &wq_off);
> -	if (ret)
> -		return ret;
> +	wq_off = gc->wq_tail;
> +
> +	/* advance the tail for next workqueue item */
> +	gc->wq_tail += sizeof(struct guc_wq_item);
> +	gc->wq_tail &= gc->wq_size - 1;

I was a bit unhappy about this code just assuming that there *must* be 
space (because we KNOW we've checked above) -- unless someone violated 
the proper calling sequence (TDR?). OTOH, it would be too expensive to 
go through the map-and-calculate code all over again just to catch an 
unlikely scenario. But, if we cache the last-calculated value as above, 
then the check could be cheap :) For example, just add a pre_checked 
size field that's set by the pre-check and then checked and decremented 
on submission; there shouldn't be more than one submission in progress 
at a time, because dev->struct_mutex is held across the whole sequence 
(but it's not an error to see two pre-checks in a row, because a request 
can be abandoned partway).

>   	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
>   	 * should not have the case where structure wqi is across page, neither
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index 0e048bf..59c8e21 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -123,5 +123,6 @@ int i915_guc_submit(struct i915_guc_client *client,
>   		    struct drm_i915_gem_request *rq);
>   void i915_guc_submission_disable(struct drm_device *dev);
>   void i915_guc_submission_fini(struct drm_device *dev);
> +int i915_guc_wq_reserve_space(struct i915_guc_client *client);
>
>   #endif
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index f96fb51..7d53d27 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -667,6 +667,12 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
>   			return ret;
>   	}
>
> +	/* Reserve GuC WQ space here (one request needs one WQ item) because
> +	 * the later i915_add_request() call can't fail. */
> +	ret = i915_guc_wq_reserve_space(request->i915->guc.execbuf_client);
> +	if (ret)
> +		return ret;
> +
>   	return 0;
>   }

Worth checking for GuC submission before that call? Maybe not ...

.Dave.
yu.dai@intel.com Dec. 11, 2015, 12:22 a.m. UTC | #2
On 12/10/2015 09:14 AM, Dave Gordon wrote:
> On 09/12/15 18:50, yu.dai@intel.com wrote:
> > From: Alex Dai <yu.dai@intel.com>
> >
> > Split GuC work queue space reserve from submission and move it to
> > ring_alloc_request_extras. The reason is that failure in later
> > i915_add_request() won't be handled. In the case timeout happens,
> > driver can return early in order to handle the error.
> >
> > v1: Move wq_reserve_space to ring_reserve_space
> > v2: Move wq_reserve_space to alloc_request_extras (Chris Wilson)
> >
> > Signed-off-by: Alex Dai <yu.dai@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_guc_submission.c | 21 +++++++++------------
> >   drivers/gpu/drm/i915/intel_guc.h           |  1 +
> >   drivers/gpu/drm/i915/intel_lrc.c           |  6 ++++++
> >   3 files changed, 16 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> > index 226e9c0..f7bd038 100644
> > --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> > @@ -472,25 +472,21 @@ static void guc_fini_ctx_desc(struct intel_guc *guc,
> >   			     sizeof(desc) * client->ctx_index);
> >   }
> >
> > -/* Get valid workqueue item and return it back to offset */
> > -static int guc_get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
> > +int i915_guc_wq_reserve_space(struct i915_guc_client *gc)
>
> I think the name is misleading, because we don't actually reserve
> anything here, just check that there is some free space in the WQ.
>
> (We certainly don't WANT to reserve anything, because that would be
> difficult to clean up in the event of submission failure for any other
> reason. So I think it's only the name needs changing. Although ...

I was trying to use a similar name to ring_reserve_space. It follows the 
same pattern as filling the ring buffer - reserve, emit and advance. The 
only difference is that it reserves 4 dwords (or checks for that much 
space) rather than a variable number of bytes as in the ring buffer case. 
Maybe using 'check' is better for this case, because 4 dwords in the wq is 
what we need for one submission via the GuC.
> >   {
> >   	struct guc_process_desc *desc;
> >   	void *base;
> >   	u32 size = sizeof(struct guc_wq_item);
> >   	int ret = -ETIMEDOUT, timeout_counter = 200;
> >
> > +	if (!gc)
> > +		return 0;
> > +
> >   	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
> >   	desc = base + gc->proc_desc_offset;
> >
> >   	while (timeout_counter-- > 0) {
> >   		if (CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size) >= size) {
>
> ... as an alternative strategy, we could cache the calculated freespace
> in the client structure; then if we already know there's at least 1 slot
> free from last time we checked, we could then just decrement the cached
> value and avoid the kmap+spinwait overhead. Only when we reach 0 would
> we have to go through this code to refresh our view of desc->head and
> recalculate the actual current freespace. [NB: clear cached value on reset?]
>
> Does that sound like a useful optimisation?

I think it is a good idea.
> > -			*offset = gc->wq_tail;
> > -
> > -			/* advance the tail for next workqueue item */
> > -			gc->wq_tail += size;
> > -			gc->wq_tail &= gc->wq_size - 1;
> > -
> >   			/* this will break the loop */
> >   			timeout_counter = 0;
> >   			ret = 0;
> > @@ -512,11 +508,12 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
> >   	struct guc_wq_item *wqi;
> >   	void *base;
> >   	u32 tail, wq_len, wq_off = 0;
> > -	int ret;
> >
> > -	ret = guc_get_workqueue_space(gc, &wq_off);
> > -	if (ret)
> > -		return ret;
> > +	wq_off = gc->wq_tail;
> > +
> > +	/* advance the tail for next workqueue item */
> > +	gc->wq_tail += sizeof(struct guc_wq_item);
> > +	gc->wq_tail &= gc->wq_size - 1;
>
> I was a bit unhappy about this code just assuming that there *must* be
> space (because we KNOW we've checked above) -- unless someone violated
> the proper calling sequence (TDR?). OTOH, it would be too expensive to
> go through the map-and-calculate code all over again just to catch an
> unlikely scenario. But, if we cache the last-calculated value as above,
> then the check could be cheap :) For example, just add a pre_checked
> size field that's set by the pre-check and then checked and decremented
> on submission; there shouldn't be more than one submission in progress
> at a time, because dev->struct_mutex is held across the whole sequence
> (but it's not an error to see two pre-checks in a row, because a request
> can be abandoned partway).

I don't understand the concerns here. As I said above, filling the GuC WQ 
is the same as filling the ring buffer - reserve (check), emit and advance. 
These two lines are the GuC version of intel_logical_ring_emit and 
intel_logical_ring_advance. Maybe if I move these two lines to the end of 
guc_add_workqueue_item, people will understand it more easily.

> >   	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
> >   	 * should not have the case where structure wqi is across page, neither
> > diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> > index 0e048bf..59c8e21 100644
> > --- a/drivers/gpu/drm/i915/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/intel_guc.h
> > @@ -123,5 +123,6 @@ int i915_guc_submit(struct i915_guc_client *client,
> >   		    struct drm_i915_gem_request *rq);
> >   void i915_guc_submission_disable(struct drm_device *dev);
> >   void i915_guc_submission_fini(struct drm_device *dev);
> > +int i915_guc_wq_reserve_space(struct i915_guc_client *client);
> >
> >   #endif
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index f96fb51..7d53d27 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -667,6 +667,12 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
> >   			return ret;
> >   	}
> >
> > +	/* Reserve GuC WQ space here (one request needs one WQ item) because
> > +	 * the later i915_add_request() call can't fail. */
> > +	ret = i915_guc_wq_reserve_space(request->i915->guc.execbuf_client);
> > +	if (ret)
> > +		return ret;
> > +
> >   	return 0;
> >   }
>
> Worth checking for GuC submission before that call? Maybe not ...

It is purely a code-alignment issue if I add an if() block. The 
execbuf_client is valid only when GuC submission is enabled, and the 
check is done inside that function call anyway.

Alex
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 226e9c0..f7bd038 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -472,25 +472,21 @@  static void guc_fini_ctx_desc(struct intel_guc *guc,
 			     sizeof(desc) * client->ctx_index);
 }
 
-/* Get valid workqueue item and return it back to offset */
-static int guc_get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
+int i915_guc_wq_reserve_space(struct i915_guc_client *gc)
 {
 	struct guc_process_desc *desc;
 	void *base;
 	u32 size = sizeof(struct guc_wq_item);
 	int ret = -ETIMEDOUT, timeout_counter = 200;
 
+	if (!gc)
+		return 0;
+
 	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
 	desc = base + gc->proc_desc_offset;
 
 	while (timeout_counter-- > 0) {
 		if (CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size) >= size) {
-			*offset = gc->wq_tail;
-
-			/* advance the tail for next workqueue item */
-			gc->wq_tail += size;
-			gc->wq_tail &= gc->wq_size - 1;
-
 			/* this will break the loop */
 			timeout_counter = 0;
 			ret = 0;
@@ -512,11 +508,12 @@  static int guc_add_workqueue_item(struct i915_guc_client *gc,
 	struct guc_wq_item *wqi;
 	void *base;
 	u32 tail, wq_len, wq_off = 0;
-	int ret;
 
-	ret = guc_get_workqueue_space(gc, &wq_off);
-	if (ret)
-		return ret;
+	wq_off = gc->wq_tail;
+
+	/* advance the tail for next workqueue item */
+	gc->wq_tail += sizeof(struct guc_wq_item);
+	gc->wq_tail &= gc->wq_size - 1;
 
 	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
 	 * should not have the case where structure wqi is across page, neither
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 0e048bf..59c8e21 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -123,5 +123,6 @@  int i915_guc_submit(struct i915_guc_client *client,
 		    struct drm_i915_gem_request *rq);
 void i915_guc_submission_disable(struct drm_device *dev);
 void i915_guc_submission_fini(struct drm_device *dev);
+int i915_guc_wq_reserve_space(struct i915_guc_client *client);
 
 #endif
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f96fb51..7d53d27 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -667,6 +667,12 @@  int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 			return ret;
 	}
 
+	/* Reserve GuC WQ space here (one request needs one WQ item) because
+	 * the later i915_add_request() call can't fail. */
+	ret = i915_guc_wq_reserve_space(request->i915->guc.execbuf_client);
+	if (ret)
+		return ret;
+
 	return 0;
 }