diff mbox series

drm/i915/guc: Check for ct enabled while waiting for response

Message ID 20220616220158.15778-1-zhanjun.dong@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/guc: Check for ct enabled while waiting for response | expand

Commit Message

Zhanjun Dong June 16, 2022, 10:01 p.m. UTC
We are seeing the error message "No response for request". In some cases
this happened because a reset/suspend action was triggered while waiting
for the response. In this case, the missing response is not an error;
active requests will be cancelled.

This patch handles this condition and changes the error message into a
debug message.

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 ++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

Comments

Ashutosh Dixit June 17, 2022, 4:42 a.m. UTC | #1
On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote:
>
> We are seeing error message of "No response for request". Some cases
> happened while waiting for response and reset/suspend action was triggered.
> In this case, no response is not an error, active requests will be
> cancelled.
>
> This patch will handle this condition and change the error message into
> debug message.

The convention we follow in drm is to record the version of the patch and
what changed in that version.

Generally I am ok with this version of the patch but still have a couple of
questions.

> -static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status)
>  {
>	int err;
> +	bool ct_enabled;
>
>	/*
>	 * Fast commands should complete in less than 10us, so sample quickly
> @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>  #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10
>  #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000
>  #define done \
> -	(FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
> +	(!(ct_enabled = intel_guc_ct_enabled(ct)) || \
> +	 FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
>	 GUC_HXG_ORIGIN_GUC)
>	err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS);
>	if (err)
>		err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS);
>  #undef done
> +	if (!ct_enabled)
> +		err = -ECANCELED;

So we have the choice of either setting the request status here as I was
suggesting earlier, e.g. as follows:

	#define   GUC_HXG_TYPE_REQUEST_CANCELED        4u // unused value

	if (!ct_enabled)
		req->status = GUC_HXG_TYPE_REQUEST_CANCELED;

We would return 0 in this case and would check for the req->status value
above where needed.

Or we can return -ECANCELED. I don't know if -ECANCELED is the right value
to return but whatever we return will have to be unique (unused elsewhere)
since we are relying on the return value. -ECANCELED is unique so that part
is ok.

Do other reviewers have a preference whether we should set req->status or
return a unique return value?

>	*status = req->status;
>	return err;
> @@ -703,11 +708,15 @@ static int ct_send(struct intel_guc_ct *ct,
>
>	intel_guc_notify(ct_to_guc(ct));
>
> -	err = wait_for_ct_request_update(&request, status);
> +	err = wait_for_ct_request_update(ct, &request, status);
>	g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
>	if (unlikely(err)) {
> -		CT_ERROR(ct, "No response for request %#x (fence %u)\n",
> -			 action[0], request.fence);
> +		if (err == -ECANCELED)
> +			CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n",
> +				 action[0], request.fence);
> +		else
> +			CT_ERROR(ct, "No response for request %#x (fence %u)\n",
> +				 action[0], request.fence);
>		goto unlink;
>	}
>
> @@ -771,8 +780,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>
>	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>	if (unlikely(ret < 0)) {
> -		CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n",
> -			 action[0], ERR_PTR(ret), status);
> +		if (ret != -ECANCELED)
> +			CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n",
> +				 action[0], ERR_PTR(ret), status);

I am wondering why we even have this print and whether we should just delete
it or convert it to CT_DEBUG(). The reason is that only error prints closest
to where the actual error occurs are useful since they pin-point the error
clearly. This seems to me to be a "second" print from a higher level
function which does not seem particularly useful.


>	} else if (unlikely(ret)) {
>		CT_DEBUG(ct, "send action %#x returned %d (%#x)\n",
>			 action[0], ret, ret);
> --
> 2.36.0
>
Ashutosh Dixit June 17, 2022, 4:50 a.m. UTC | #2
On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote:
>
> We are seeing error message of "No response for request". Some cases
> happened while waiting for response and reset/suspend action was triggered.
> In this case, no response is not an error, active requests will be
> cancelled.
>
> This patch will handle this condition and change the error message into
> debug message.
>
> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 ++++++++++++++++-------
>  1 file changed, 17 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index f01325cd1b62..f07a7666b1ad 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct,
>
>  /**
>   * wait_for_ct_request_update - Wait for CT request state update.
> + * @ct:		pointer to CT
>   * @req:	pointer to pending request
>   * @status:	placeholder for status
>   *
> @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct,
>   * *	0 response received (status is valid)
>   * *	-ETIMEDOUT no response within hardcoded timeout
>   */
> -static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status)
>  {
>	int err;
> +	bool ct_enabled;
>
>	/*
>	 * Fast commands should complete in less than 10us, so sample quickly
> @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>  #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10
>  #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000
>  #define done \
> -	(FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
> +	(!(ct_enabled = intel_guc_ct_enabled(ct)) || \
> +	 FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
>	 GUC_HXG_ORIGIN_GUC)
>	err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS);
>	if (err)
>		err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS);
>  #undef done
> +	if (!ct_enabled)
> +		err = -ECANCELED;

Actually here's an even simpler suggestion. We could just do:

	if (!ct_enabled)
		CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", ...);

And return 0 as before. This way we won't have to make any changes in
either ct_send() or intel_guc_ct_send(). So intel_guc_ct_enabled() just
serves to get us out of the wait early and prevent the -ETIMEDOUT return
(and 0 return avoids all the error messages we are trying to eliminate).
Ashutosh Dixit July 12, 2022, 7:47 p.m. UTC | #3
On Thu, 16 Jun 2022 21:50:55 -0700, Dixit, Ashutosh wrote:
>
> On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote:
> >
> > We are seeing error message of "No response for request". Some cases
> > happened while waiting for response and reset/suspend action was triggered.
> > In this case, no response is not an error, active requests will be
> > cancelled.
> >
> > This patch will handle this condition and change the error message into
> > debug message.
> >
> > Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
> > ---
> >  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 ++++++++++++++++-------
> >  1 file changed, 17 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > index f01325cd1b62..f07a7666b1ad 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct,
> >
> >  /**
> >   * wait_for_ct_request_update - Wait for CT request state update.
> > + * @ct:		pointer to CT
> >   * @req:	pointer to pending request
> >   * @status:	placeholder for status
> >   *
> > @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct,
> >   * *	0 response received (status is valid)
> >   * *	-ETIMEDOUT no response within hardcoded timeout
> >   */
> > -static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> > +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status)
> >  {
> >	int err;
> > +	bool ct_enabled;
> >
> >	/*
> >	 * Fast commands should complete in less than 10us, so sample quickly
> > @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> >  #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10
> >  #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000
> >  #define done \
> > -	(FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
> > +	(!(ct_enabled = intel_guc_ct_enabled(ct)) || \
> > +	 FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
> >	 GUC_HXG_ORIGIN_GUC)
> >	err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS);
> >	if (err)
> >		err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS);
> >  #undef done
> > +	if (!ct_enabled)
> > +		err = -ECANCELED;
>
> Actually here's an even simpler suggestion. We could just do:
>
>	if (!ct_enabled)
>		CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", ...);
>
> And return 0 as before. This way we won't have to make any changes in
> either ct_send() or intel_guc_ct_send(). So intel_guc_ct_enabled() just
> serves to get us out of the wait early and prevent the -ETIMEDOUT return
> (and 0 return avoids all the error messages we are trying to eliminate).

Actually we will need to unlink the request too, so it will be something like:

	if (!ct_enabled) {
		CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", ...);

		spin_lock_irqsave(&ct->requests.lock, flags);
		list_del(&request.link);
		spin_unlock_irqrestore(&ct->requests.lock, flags);
	}
Zhanjun Dong July 13, 2022, 9:45 p.m. UTC | #4
> -----Original Message-----
> From: Dixit, Ashutosh <ashutosh.dixit@intel.com>
> Sent: July 12, 2022 3:48 PM
> To: Dong, Zhanjun <zhanjun.dong@intel.com>
> Cc: intel-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH] drm/i915/guc: Check for ct enabled while
> waiting for response
> 
> On Thu, 16 Jun 2022 21:50:55 -0700, Dixit, Ashutosh wrote:
> >
> > On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote:
> > >
> > > We are seeing error message of "No response for request". Some cases
> > > happened while waiting for response and reset/suspend action was
> triggered.
> > > In this case, no response is not an error, active requests will be
> > > cancelled.
> > >
> > > This patch will handle this condition and change the error message
> > > into debug message.
> > >
> > > Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24
> > > ++++++++++++++++-------
> > >  1 file changed, 17 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > index f01325cd1b62..f07a7666b1ad 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct,
> > >
> > >  /**
> > >   * wait_for_ct_request_update - Wait for CT request state update.
> > > + * @ct:		pointer to CT
> > >   * @req:	pointer to pending request
> > >   * @status:	placeholder for status
> > >   *
> > > @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct,
> > >   * *	0 response received (status is valid)
> > >   * *	-ETIMEDOUT no response within hardcoded timeout
> > >   */
> > > -static int wait_for_ct_request_update(struct ct_request *req, u32
> > > *status)
> > > +static int wait_for_ct_request_update(struct intel_guc_ct *ct,
> > > +struct ct_request *req, u32 *status)
> > >  {
> > >	int err;
> > > +	bool ct_enabled;
> > >
> > >	/*
> > >	 * Fast commands should complete in less than 10us, so sample
> > >quickly  @@ -481,12 +483,15 @@ static int
> > >wait_for_ct_request_update(struct ct_request *req, u32 *status)
> > >  #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10
> > >  #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000
> > >  #define done \
> > > -	(FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) ==
> \
> > > +	(!(ct_enabled = intel_guc_ct_enabled(ct)) || \
> > > +	 FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) ==
> \
> > >	 GUC_HXG_ORIGIN_GUC)
> > >	err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS);
> > >	if (err)
> > >		err = wait_for(done,
> GUC_CTB_RESPONSE_TIMEOUT_LONG_MS);
> > >  #undef done
> > > +	if (!ct_enabled)
> > > +		err = -ECANCELED;
> >
> > Actually here's an even simpler suggestion. We could just do:
> >
> >	if (!ct_enabled)
> >		CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is
> disabled\n",
> >...);
> >
> > And return 0 as before. This way we won't have to make any changes in
> > either ct_send() or intel_guc_ct_send(). So intel_guc_ct_enabled()
> > just serves to get us out of the wait early and prevent the -ETIMEDOUT
> > return (and 0 return avoids all the error messages we are trying to
> eliminate).
> 
> Actually will need to unlink the request too, so it will be something like:
> 
> 	if (!ct_enabled) {
> 		CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is
> disabled\n", ...);
> 
> 		spin_lock_irqsave(&ct->requests.lock, flags);
> 		list_del(&request.link);
> 		spin_unlock_irqrestore(&ct->requests.lock, flags);
> 	}

I agree, the caller function needs err to be non-zero to know that the request did not succeed, and to unlink the request.
The caller function, ct_send, will do the unlink.

As for the error code ECANCELED: intel_guc_ct_send already returns ENODEV if ct is disabled, so this patch will be changed to use ENODEV instead, to match it.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index f01325cd1b62..f07a7666b1ad 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -455,6 +455,7 @@  static int ct_write(struct intel_guc_ct *ct,
 
 /**
  * wait_for_ct_request_update - Wait for CT request state update.
+ * @ct:		pointer to CT
  * @req:	pointer to pending request
  * @status:	placeholder for status
  *
@@ -467,9 +468,10 @@  static int ct_write(struct intel_guc_ct *ct,
  * *	0 response received (status is valid)
  * *	-ETIMEDOUT no response within hardcoded timeout
  */
-static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
+static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status)
 {
 	int err;
+	bool ct_enabled;
 
 	/*
 	 * Fast commands should complete in less than 10us, so sample quickly
@@ -481,12 +483,15 @@  static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
 #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10
 #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000
 #define done \
-	(FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
+	(!(ct_enabled = intel_guc_ct_enabled(ct)) || \
+	 FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \
 	 GUC_HXG_ORIGIN_GUC)
 	err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS);
 	if (err)
 		err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS);
 #undef done
+	if (!ct_enabled)
+		err = -ECANCELED;
 
 	*status = req->status;
 	return err;
@@ -703,11 +708,15 @@  static int ct_send(struct intel_guc_ct *ct,
 
 	intel_guc_notify(ct_to_guc(ct));
 
-	err = wait_for_ct_request_update(&request, status);
+	err = wait_for_ct_request_update(ct, &request, status);
 	g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
 	if (unlikely(err)) {
-		CT_ERROR(ct, "No response for request %#x (fence %u)\n",
-			 action[0], request.fence);
+		if (err == -ECANCELED)
+			CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n",
+				 action[0], request.fence);
+		else
+			CT_ERROR(ct, "No response for request %#x (fence %u)\n",
+				 action[0], request.fence);
 		goto unlink;
 	}
 
@@ -771,8 +780,9 @@  int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
 
 	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
 	if (unlikely(ret < 0)) {
-		CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n",
-			 action[0], ERR_PTR(ret), status);
+		if (ret != -ECANCELED)
+			CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n",
+				 action[0], ERR_PTR(ret), status);
 	} else if (unlikely(ret)) {
 		CT_DEBUG(ct, "send action %#x returned %d (%#x)\n",
 			 action[0], ret, ret);