diff mbox series

[RFC,36/97] drm/i915/guc: Add non blocking CTB send function

Message ID 20210506191451.77768-37-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Basic GuC submission support in the i915 | expand

Commit Message

Matthew Brost May 6, 2021, 7:13 p.m. UTC
Add non blocking CTB send function, intel_guc_send_nb. In order to
support a non blocking CTB send function a spin lock is needed to
protect the CTB descriptors fields. Also the non blocking call must not
update the fence value as this value is owned by the blocking call
(intel_guc_send).

The blocking CTB now must have a flow control mechanism to ensure the
buffer isn't overrun. A lazy spin wait is used as we believe the flow
control condition should be rare with properly sized buffer.

The function, intel_guc_send_nb, is exported in this patch but unused.
Several patches later in the series make use of this function.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
 3 files changed, 105 insertions(+), 10 deletions(-)

Comments

Michal Wajdeczko May 24, 2021, 12:21 p.m. UTC | #1
On 06.05.2021 21:13, Matthew Brost wrote:
> Add non blocking CTB send function, intel_guc_send_nb. In order to
> support a non blocking CTB send function a spin lock is needed to

spin lock was added in 16/97

> protect the CTB descriptors fields. Also the non blocking call must not
> update the fence value as this value is owned by the blocking call
> (intel_guc_send).

all H2G messages are using "fence", nb variant also needs to update it

> 
> The blocking CTB now must have a flow control mechanism to ensure the

s/blocking/non-blocking

> buffer isn't overrun. A lazy spin wait is used as we believe the flow
> control condition should be rare with properly sized buffer.

as this new nb function is still not used in this patch, then maybe
better to move flow control to separate patch for easier review ?

> 
> The function, intel_guc_send_nb, is exported in this patch but unused.
> Several patches later in the series make use of this function.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
>  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
>  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
>  3 files changed, 105 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index c20f3839de12..4c0a367e41d8 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -75,7 +75,15 @@ static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>  static
>  inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
>  {
> -	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
> +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
> +}
> +
> +#define INTEL_GUC_SEND_NB		BIT(31)
> +static
> +inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
> +{
> +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
> +				 INTEL_GUC_SEND_NB);
>  }
>  
>  static inline int
> @@ -83,7 +91,7 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
>  			   u32 *response_buf, u32 response_buf_size)
>  {
>  	return intel_guc_ct_send(&guc->ct, action, len,
> -				 response_buf, response_buf_size);
> +				 response_buf, response_buf_size, 0);
>  }
>  
>  static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index a76603537fa8..af7314d45a78 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -3,6 +3,11 @@
>   * Copyright © 2016-2019 Intel Corporation
>   */
>  
> +#include <linux/circ_buf.h>
> +#include <linux/ktime.h>
> +#include <linux/time64.h>
> +#include <linux/timekeeping.h>
> +
>  #include "i915_drv.h"
>  #include "intel_guc_ct.h"
>  #include "gt/intel_gt.h"
> @@ -308,6 +313,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>  	if (unlikely(err))
>  		goto err_deregister;
>  
> +	ct->requests.last_fence = 1;

not needed

>  	ct->enabled = true;
>  
>  	return 0;
> @@ -343,10 +349,22 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
>  	return ++ct->requests.last_fence;
>  }
>  
> +static void write_barrier(struct intel_guc_ct *ct) {
> +	struct intel_guc *guc = ct_to_guc(ct);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +
> +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> +		GEM_BUG_ON(guc->send_regs.fw_domains);
> +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> +	} else {
> +		wmb();
> +	}
> +}

this chunk seems to be good candidate for separate patch that could be
introduced earlier

> +
>  static int ct_write(struct intel_guc_ct *ct,
>  		    const u32 *action,
>  		    u32 len /* in dwords */,
> -		    u32 fence)
> +		    u32 fence, u32 flags)
>  {
>  	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>  	struct guc_ct_buffer_desc *desc = ctb->desc;
> @@ -393,9 +411,13 @@ static int ct_write(struct intel_guc_ct *ct,
>  		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
>  		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
>  
> -	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> -	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> -			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
> +	hxg = (flags & INTEL_GUC_SEND_NB) ?
> +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> +		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
> +			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
> +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> +		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> +			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
>  
>  	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
>  		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
> @@ -412,6 +434,12 @@ static int ct_write(struct intel_guc_ct *ct,
>  	}
>  	GEM_BUG_ON(tail > size);
>  
> +	/*
> +	 * make sure H2G buffer update and LRC tail update (if this triggering a
> +	 * submission) are visable before updating the descriptor tail

typo

> +	 */
> +	write_barrier(ct);
> +
>  	/* now update descriptor */
>  	WRITE_ONCE(desc->tail, tail);
>  
> @@ -466,6 +494,46 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>  	return err;
>  }
>  
> +static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
> +{
> +	struct guc_ct_buffer_desc *desc = ctb->desc;
> +	u32 head = READ_ONCE(desc->head);
> +	u32 space;
> +
> +	space = CIRC_SPACE(desc->tail, head, ctb->size);

shouldn't we use READ_ONCE for reading the tail?

> +
> +	return space >= len_dw;
> +}
> +
> +static int ct_send_nb(struct intel_guc_ct *ct,
> +		      const u32 *action,
> +		      u32 len,
> +		      u32 flags)
> +{
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> +	unsigned long spin_flags;
> +	u32 fence;
> +	int ret;
> +
> +	spin_lock_irqsave(&ctb->lock, spin_flags);
> +
> +	ret = ctb_has_room(ctb, len + 1);

why +1 ?

> +	if (unlikely(ret))
> +		goto out;
> +
> +	fence = ct_get_next_fence(ct);
> +	ret = ct_write(ct, action, len, fence, flags);
> +	if (unlikely(ret))
> +		goto out;
> +
> +	intel_guc_notify(ct_to_guc(ct));
> +
> +out:
> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> +
> +	return ret;
> +}
> +
>  static int ct_send(struct intel_guc_ct *ct,
>  		   const u32 *action,
>  		   u32 len,
> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>  		   u32 response_buf_size,
>  		   u32 *status)
>  {
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>  	struct ct_request request;
>  	unsigned long flags;
>  	u32 fence;
> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>  	GEM_BUG_ON(!len);
>  	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>  	GEM_BUG_ON(!response_buf && response_buf_size);
> +	might_sleep();
>  
> +	/*
> +	 * We use a lazy spin wait loop here as we believe that if the CT
> +	 * buffers are sized correctly the flow control condition should be
> +	 * rare.
> +	 */
> +retry:
>  	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {

why +1 ?

> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> +		cond_resched();
> +		goto retry;
> +	}

hmm, full CTB can also be seen in case of nb, but it looks that only in
case of blocking call you want to use lazy spin, why ?

also, what if situation is not improving ?
will we be looping here forever ?

>  
>  	fence = ct_get_next_fence(ct);
>  	request.fence = fence;
> @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
>  	list_add_tail(&request.link, &ct->requests.pending);
>  	spin_unlock(&ct->requests.lock);
>  
> -	err = ct_write(ct, action, len, fence);
> +	err = ct_write(ct, action, len, fence, 0);
>  
>  	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>  
> @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
>   * Command Transport (CT) buffer based GuC send function.
>   */
>  int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> -		      u32 *response_buf, u32 response_buf_size)
> +		      u32 *response_buf, u32 response_buf_size, u32 flags)
>  {
>  	u32 status = ~0; /* undefined */
>  	int ret;
> @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>  		return -ENODEV;
>  	}
>  
> +	if (flags & INTEL_GUC_SEND_NB)
> +		return ct_send_nb(ct, action, len, flags);
> +
>  	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>  	if (unlikely(ret < 0)) {
>  		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> index 1ae2dde6db93..55ef7c52472f 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> @@ -9,6 +9,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/spinlock.h>
>  #include <linux/workqueue.h>
> +#include <linux/ktime.h>
>  
>  #include "intel_guc_fwif.h"
>  
> @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
>  	bool broken;
>  };
>  
> -
>  /** Top-level structure for Command Transport related data
>   *
>   * Includes a pair of CT buffers for bi-directional communication and tracking
> @@ -69,6 +69,9 @@ struct intel_guc_ct {
>  		struct list_head incoming; /* incoming requests */
>  		struct work_struct worker; /* handler for incoming requests */
>  	} requests;
> +
> +	/** @stall_time: time of first time a CTB submission is stalled */
> +	ktime_t stall_time;

this should be introduced in 37/97

>  };
>  
>  void intel_guc_ct_init_early(struct intel_guc_ct *ct);
> @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
>  }
>  
>  int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> -		      u32 *response_buf, u32 response_buf_size);
> +		      u32 *response_buf, u32 response_buf_size, u32 flags);
>  void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
>  
>  #endif /* _INTEL_GUC_CT_H_ */
>
Tvrtko Ursulin May 25, 2021, 9:21 a.m. UTC | #2
On 06/05/2021 20:13, Matthew Brost wrote:
> Add non blocking CTB send function, intel_guc_send_nb. In order to
> support a non blocking CTB send function a spin lock is needed to
> protect the CTB descriptors fields. Also the non blocking call must not
> update the fence value as this value is owned by the blocking call
> (intel_guc_send).

Could the commit message say why the non-blocking send function is needed?

> 
> The blocking CTB now must have a flow control mechanism to ensure the
> buffer isn't overrun. A lazy spin wait is used as we believe the flow
> control condition should be rare with properly sized buffer.
> 
> The function, intel_guc_send_nb, is exported in this patch but unused.
> Several patches later in the series make use of this function.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
>   3 files changed, 105 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index c20f3839de12..4c0a367e41d8 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -75,7 +75,15 @@ static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>   static
>   inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
>   {
> -	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
> +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
> +}
> +
> +#define INTEL_GUC_SEND_NB		BIT(31)
> +static
> +inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
> +{
> +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
> +				 INTEL_GUC_SEND_NB);
>   }
>   
>   static inline int
> @@ -83,7 +91,7 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
>   			   u32 *response_buf, u32 response_buf_size)
>   {
>   	return intel_guc_ct_send(&guc->ct, action, len,
> -				 response_buf, response_buf_size);
> +				 response_buf, response_buf_size, 0);
>   }
>   
>   static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index a76603537fa8..af7314d45a78 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -3,6 +3,11 @@
>    * Copyright © 2016-2019 Intel Corporation
>    */
>   
> +#include <linux/circ_buf.h>
> +#include <linux/ktime.h>
> +#include <linux/time64.h>
> +#include <linux/timekeeping.h>
> +
>   #include "i915_drv.h"
>   #include "intel_guc_ct.h"
>   #include "gt/intel_gt.h"
> @@ -308,6 +313,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>   	if (unlikely(err))
>   		goto err_deregister;
>   
> +	ct->requests.last_fence = 1;
>   	ct->enabled = true;
>   
>   	return 0;
> @@ -343,10 +349,22 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
>   	return ++ct->requests.last_fence;
>   }
>   
> +static void write_barrier(struct intel_guc_ct *ct) {
> +	struct intel_guc *guc = ct_to_guc(ct);
> +	struct intel_gt *gt = guc_to_gt(guc);
> +
> +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> +		GEM_BUG_ON(guc->send_regs.fw_domains);
> +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);

It's safe to write to this reg? Does it need a comment to explain it?

> +	} else {
> +		wmb();
> +	}
> +}
> +
>   static int ct_write(struct intel_guc_ct *ct,
>   		    const u32 *action,
>   		    u32 len /* in dwords */,
> -		    u32 fence)
> +		    u32 fence, u32 flags)
>   {
>   	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>   	struct guc_ct_buffer_desc *desc = ctb->desc;
> @@ -393,9 +411,13 @@ static int ct_write(struct intel_guc_ct *ct,
>   		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
>   		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
>   
> -	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> -	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> -			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
> +	hxg = (flags & INTEL_GUC_SEND_NB) ?
> +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> +		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
> +			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
> +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> +		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> +			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
>   
>   	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
>   		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
> @@ -412,6 +434,12 @@ static int ct_write(struct intel_guc_ct *ct,
>   	}
>   	GEM_BUG_ON(tail > size);
>   
> +	/*
> +	 * make sure H2G buffer update and LRC tail update (if this triggering a
> +	 * submission) are visable before updating the descriptor tail
> +	 */
> +	write_barrier(ct);
> +
>   	/* now update descriptor */
>   	WRITE_ONCE(desc->tail, tail);
>   
> @@ -466,6 +494,46 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>   	return err;
>   }
>   
> +static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
> +{
> +	struct guc_ct_buffer_desc *desc = ctb->desc;
> +	u32 head = READ_ONCE(desc->head);
> +	u32 space;
> +
> +	space = CIRC_SPACE(desc->tail, head, ctb->size);
> +
> +	return space >= len_dw;
> +}
> +
> +static int ct_send_nb(struct intel_guc_ct *ct,
> +		      const u32 *action,
> +		      u32 len,
> +		      u32 flags)
> +{
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> +	unsigned long spin_flags;
> +	u32 fence;
> +	int ret;
> +
> +	spin_lock_irqsave(&ctb->lock, spin_flags);
> +
> +	ret = ctb_has_room(ctb, len + 1);
> +	if (unlikely(ret))
> +		goto out;
> +
> +	fence = ct_get_next_fence(ct);
> +	ret = ct_write(ct, action, len, fence, flags);
> +	if (unlikely(ret))
> +		goto out;
> +
> +	intel_guc_notify(ct_to_guc(ct));
> +
> +out:
> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> +
> +	return ret;
> +}
> +
>   static int ct_send(struct intel_guc_ct *ct,
>   		   const u32 *action,
>   		   u32 len,
> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>   		   u32 response_buf_size,
>   		   u32 *status)
>   {
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>   	struct ct_request request;
>   	unsigned long flags;
>   	u32 fence;
> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>   	GEM_BUG_ON(!len);
>   	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>   	GEM_BUG_ON(!response_buf && response_buf_size);
> +	might_sleep();

Sleep is just cond_resched below or there is more?

>   
> +	/*
> +	 * We use a lazy spin wait loop here as we believe that if the CT
> +	 * buffers are sized correctly the flow control condition should be
> +	 * rare.
> +	 */
> +retry:
>   	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> +		cond_resched();
> +		goto retry;
> +	}

If this patch is about adding a non-blocking send function, and below we 
can see that it creates a fork:

intel_guc_ct_send:
...
	if (flags & INTEL_GUC_SEND_NB)
		return ct_send_nb(ct, action, len, flags);

  	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);

Then why is there a change in ct_send here, which is not the new 
non-blocking path?

>   
>   	fence = ct_get_next_fence(ct);
>   	request.fence = fence;
> @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
>   	list_add_tail(&request.link, &ct->requests.pending);
>   	spin_unlock(&ct->requests.lock);
>   
> -	err = ct_write(ct, action, len, fence);
> +	err = ct_write(ct, action, len, fence, 0);
>   
>   	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>   
> @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
>    * Command Transport (CT) buffer based GuC send function.
>    */
>   int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> -		      u32 *response_buf, u32 response_buf_size)
> +		      u32 *response_buf, u32 response_buf_size, u32 flags)
>   {
>   	u32 status = ~0; /* undefined */
>   	int ret;
> @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>   		return -ENODEV;
>   	}
>   
> +	if (flags & INTEL_GUC_SEND_NB)
> +		return ct_send_nb(ct, action, len, flags);
> +
>   	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>   	if (unlikely(ret < 0)) {
>   		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> index 1ae2dde6db93..55ef7c52472f 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> @@ -9,6 +9,7 @@
>   #include <linux/interrupt.h>
>   #include <linux/spinlock.h>
>   #include <linux/workqueue.h>
> +#include <linux/ktime.h>
>   
>   #include "intel_guc_fwif.h"
>   
> @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
>   	bool broken;
>   };
>   
> -
>   /** Top-level structure for Command Transport related data
>    *
>    * Includes a pair of CT buffers for bi-directional communication and tracking
> @@ -69,6 +69,9 @@ struct intel_guc_ct {
>   		struct list_head incoming; /* incoming requests */
>   		struct work_struct worker; /* handler for incoming requests */
>   	} requests;
> +
> +	/** @stall_time: time of first time a CTB submission is stalled */
> +	ktime_t stall_time;

Unused in this patch.

>   };
>   
>   void intel_guc_ct_init_early(struct intel_guc_ct *ct);
> @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
>   }
>   
>   int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> -		      u32 *response_buf, u32 response_buf_size);
> +		      u32 *response_buf, u32 response_buf_size, u32 flags);
>   void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
>   
>   #endif /* _INTEL_GUC_CT_H_ */
> 

Regards,

Tvrtko
Matthew Brost May 25, 2021, 5:21 p.m. UTC | #3
On Tue, May 25, 2021 at 10:21:00AM +0100, Tvrtko Ursulin wrote:
> 
> On 06/05/2021 20:13, Matthew Brost wrote:
> > Add non blocking CTB send function, intel_guc_send_nb. In order to
> > support a non blocking CTB send function a spin lock is needed to
> > protect the CTB descriptors fields. Also the non blocking call must not
> > update the fence value as this value is owned by the blocking call
> > (intel_guc_send).
> 
> Could the commit message say why the non-blocking send function is needed?
> 

Sure. Something like:

'CTBs will be used in the critical patch of GuC submission and there is
no need to wait for each CTB complete before moving on the i915'

> > 
> > The blocking CTB now must have a flow control mechanism to ensure the
> > buffer isn't overrun. A lazy spin wait is used as we believe the flow
> > control condition should be rare with properly sized buffer.
> > 
> > The function, intel_guc_send_nb, is exported in this patch but unused.
> > Several patches later in the series make use of this function.
> > 
> > Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
> >   3 files changed, 105 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > index c20f3839de12..4c0a367e41d8 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > @@ -75,7 +75,15 @@ static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> >   static
> >   inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
> >   {
> > -	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
> > +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
> > +}
> > +
> > +#define INTEL_GUC_SEND_NB		BIT(31)
> > +static
> > +inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
> > +{
> > +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
> > +				 INTEL_GUC_SEND_NB);
> >   }
> >   static inline int
> > @@ -83,7 +91,7 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
> >   			   u32 *response_buf, u32 response_buf_size)
> >   {
> >   	return intel_guc_ct_send(&guc->ct, action, len,
> > -				 response_buf, response_buf_size);
> > +				 response_buf, response_buf_size, 0);
> >   }
> >   static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > index a76603537fa8..af7314d45a78 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > @@ -3,6 +3,11 @@
> >    * Copyright © 2016-2019 Intel Corporation
> >    */
> > +#include <linux/circ_buf.h>
> > +#include <linux/ktime.h>
> > +#include <linux/time64.h>
> > +#include <linux/timekeeping.h>
> > +
> >   #include "i915_drv.h"
> >   #include "intel_guc_ct.h"
> >   #include "gt/intel_gt.h"
> > @@ -308,6 +313,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
> >   	if (unlikely(err))
> >   		goto err_deregister;
> > +	ct->requests.last_fence = 1;
> >   	ct->enabled = true;
> >   	return 0;
> > @@ -343,10 +349,22 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
> >   	return ++ct->requests.last_fence;
> >   }
> > +static void write_barrier(struct intel_guc_ct *ct) {
> > +	struct intel_guc *guc = ct_to_guc(ct);
> > +	struct intel_gt *gt = guc_to_gt(guc);
> > +
> > +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > +		GEM_BUG_ON(guc->send_regs.fw_domains);
> > +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> 
> It's safe to write to this reg? Does it need a comment to explain it?
>

Yes, it is same. IMO 'SCRATCH' in the name is enough documentation.
 
> > +	} else {
> > +		wmb();
> > +	}
> > +}
> > +
> >   static int ct_write(struct intel_guc_ct *ct,
> >   		    const u32 *action,
> >   		    u32 len /* in dwords */,
> > -		    u32 fence)
> > +		    u32 fence, u32 flags)
> >   {
> >   	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >   	struct guc_ct_buffer_desc *desc = ctb->desc;
> > @@ -393,9 +411,13 @@ static int ct_write(struct intel_guc_ct *ct,
> >   		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
> >   		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
> > -	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> > -	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> > -			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
> > +	hxg = (flags & INTEL_GUC_SEND_NB) ?
> > +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> > +		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
> > +			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
> > +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> > +		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> > +			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
> >   	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
> >   		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
> > @@ -412,6 +434,12 @@ static int ct_write(struct intel_guc_ct *ct,
> >   	}
> >   	GEM_BUG_ON(tail > size);
> > +	/*
> > +	 * make sure H2G buffer update and LRC tail update (if this triggering a
> > +	 * submission) are visable before updating the descriptor tail
> > +	 */
> > +	write_barrier(ct);
> > +
> >   	/* now update descriptor */
> >   	WRITE_ONCE(desc->tail, tail);
> > @@ -466,6 +494,46 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> >   	return err;
> >   }
> > +static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
> > +{
> > +	struct guc_ct_buffer_desc *desc = ctb->desc;
> > +	u32 head = READ_ONCE(desc->head);
> > +	u32 space;
> > +
> > +	space = CIRC_SPACE(desc->tail, head, ctb->size);
> > +
> > +	return space >= len_dw;
> > +}
> > +
> > +static int ct_send_nb(struct intel_guc_ct *ct,
> > +		      const u32 *action,
> > +		      u32 len,
> > +		      u32 flags)
> > +{
> > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > +	unsigned long spin_flags;
> > +	u32 fence;
> > +	int ret;
> > +
> > +	spin_lock_irqsave(&ctb->lock, spin_flags);
> > +
> > +	ret = ctb_has_room(ctb, len + 1);
> > +	if (unlikely(ret))
> > +		goto out;
> > +
> > +	fence = ct_get_next_fence(ct);
> > +	ret = ct_write(ct, action, len, fence, flags);
> > +	if (unlikely(ret))
> > +		goto out;
> > +
> > +	intel_guc_notify(ct_to_guc(ct));
> > +
> > +out:
> > +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > +
> > +	return ret;
> > +}
> > +
> >   static int ct_send(struct intel_guc_ct *ct,
> >   		   const u32 *action,
> >   		   u32 len,
> > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >   		   u32 response_buf_size,
> >   		   u32 *status)
> >   {
> > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >   	struct ct_request request;
> >   	unsigned long flags;
> >   	u32 fence;
> > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >   	GEM_BUG_ON(!len);
> >   	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >   	GEM_BUG_ON(!response_buf && response_buf_size);
> > +	might_sleep();
> 
> Sleep is just cond_resched below or there is more?
> 

Yes, the cond_resched.

> > +	/*
> > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > +	 * buffers are sized correctly the flow control condition should be
> > +	 * rare.
> > +	 */
> > +retry:
> >   	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > +		cond_resched();
> > +		goto retry;
> > +	}
> 
> If this patch is about adding a non-blocking send function, and below we can
> see that it creates a fork:
> 
> intel_guc_ct_send:
> ...
> 	if (flags & INTEL_GUC_SEND_NB)
> 		return ct_send_nb(ct, action, len, flags);
> 
>  	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> 
> Then why is there a change in ct_send here, which is not the new
> non-blocking path?
>

There is not a change to ct_send(), just to intel_guc_ct_send.

As for why intel_guc_ct_send is updated and we don't just a new public
function, this was another reviewers suggestion. Again can't make
everyone happy.
 
> >   	fence = ct_get_next_fence(ct);
> >   	request.fence = fence;
> > @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >   	list_add_tail(&request.link, &ct->requests.pending);
> >   	spin_unlock(&ct->requests.lock);
> > -	err = ct_write(ct, action, len, fence);
> > +	err = ct_write(ct, action, len, fence, 0);
> >   	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >    * Command Transport (CT) buffer based GuC send function.
> >    */
> >   int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > -		      u32 *response_buf, u32 response_buf_size)
> > +		      u32 *response_buf, u32 response_buf_size, u32 flags)
> >   {
> >   	u32 status = ~0; /* undefined */
> >   	int ret;
> > @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> >   		return -ENODEV;
> >   	}
> > +	if (flags & INTEL_GUC_SEND_NB)
> > +		return ct_send_nb(ct, action, len, flags);
> > +
> >   	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> >   	if (unlikely(ret < 0)) {
> >   		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > index 1ae2dde6db93..55ef7c52472f 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > @@ -9,6 +9,7 @@
> >   #include <linux/interrupt.h>
> >   #include <linux/spinlock.h>
> >   #include <linux/workqueue.h>
> > +#include <linux/ktime.h>
> >   #include "intel_guc_fwif.h"
> > @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
> >   	bool broken;
> >   };
> > -
> >   /** Top-level structure for Command Transport related data
> >    *
> >    * Includes a pair of CT buffers for bi-directional communication and tracking
> > @@ -69,6 +69,9 @@ struct intel_guc_ct {
> >   		struct list_head incoming; /* incoming requests */
> >   		struct work_struct worker; /* handler for incoming requests */
> >   	} requests;
> > +
> > +	/** @stall_time: time of first time a CTB submission is stalled */
> > +	ktime_t stall_time;
> 
> Unused in this patch.
>

Yea, wrong patch. Will fix.

Matt
 
> >   };
> >   void intel_guc_ct_init_early(struct intel_guc_ct *ct);
> > @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
> >   }
> >   int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > -		      u32 *response_buf, u32 response_buf_size);
> > +		      u32 *response_buf, u32 response_buf_size, u32 flags);
> >   void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
> >   #endif /* _INTEL_GUC_CT_H_ */
> > 
> 
> Regards,
> 
> Tvrtko
Matthew Brost May 25, 2021, 5:30 p.m. UTC | #4
On Mon, May 24, 2021 at 02:21:42PM +0200, Michal Wajdeczko wrote:
> 
> 
> On 06.05.2021 21:13, Matthew Brost wrote:
> > Add non blocking CTB send function, intel_guc_send_nb. In order to
> > support a non blocking CTB send function a spin lock is needed to
> 
> spin lock was added in 16/97
> 
> > protect the CTB descriptors fields. Also the non blocking call must not
> > update the fence value as this value is owned by the blocking call
> > (intel_guc_send).
> 
> all H2G messages are using "fence", nb variant also needs to update it
> 
> > 
> > The blocking CTB now must have a flow control mechanism to ensure the
> 
> s/blocking/non-blocking
> 

Will fix the comments as these are a bit stale.

> > buffer isn't overrun. A lazy spin wait is used as we believe the flow
> > control condition should be rare with properly sized buffer.
> 
> as this new nb function is still not used in this patch, then maybe
> better to move flow control to separate patch for easier review ?
>

You can't do non-blocking without flow control, it just doesn't work.
IMO that makes the review harder.
 
> > 
> > The function, intel_guc_send_nb, is exported in this patch but unused.
> > Several patches later in the series make use of this function.
> > 
> > Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
> >  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
> >  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
> >  3 files changed, 105 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > index c20f3839de12..4c0a367e41d8 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > @@ -75,7 +75,15 @@ static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> >  static
> >  inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
> >  {
> > -	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
> > +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
> > +}
> > +
> > +#define INTEL_GUC_SEND_NB		BIT(31)
> > +static
> > +inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
> > +{
> > +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
> > +				 INTEL_GUC_SEND_NB);
> >  }
> >  
> >  static inline int
> > @@ -83,7 +91,7 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
> >  			   u32 *response_buf, u32 response_buf_size)
> >  {
> >  	return intel_guc_ct_send(&guc->ct, action, len,
> > -				 response_buf, response_buf_size);
> > +				 response_buf, response_buf_size, 0);
> >  }
> >  
> >  static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > index a76603537fa8..af7314d45a78 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > @@ -3,6 +3,11 @@
> >   * Copyright © 2016-2019 Intel Corporation
> >   */
> >  
> > +#include <linux/circ_buf.h>
> > +#include <linux/ktime.h>
> > +#include <linux/time64.h>
> > +#include <linux/timekeeping.h>
> > +
> >  #include "i915_drv.h"
> >  #include "intel_guc_ct.h"
> >  #include "gt/intel_gt.h"
> > @@ -308,6 +313,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
> >  	if (unlikely(err))
> >  		goto err_deregister;
> >  
> > +	ct->requests.last_fence = 1;
> 
> not needed
>

Yep.
 
> >  	ct->enabled = true;
> >  
> >  	return 0;
> > @@ -343,10 +349,22 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
> >  	return ++ct->requests.last_fence;
> >  }
> >  
> > +static void write_barrier(struct intel_guc_ct *ct) {
> > +	struct intel_guc *guc = ct_to_guc(ct);
> > +	struct intel_gt *gt = guc_to_gt(guc);
> > +
> > +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > +		GEM_BUG_ON(guc->send_regs.fw_domains);
> > +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> > +	} else {
> > +		wmb();
> > +	}
> > +}
> 
> this chunk seems to be good candidate for separate patch that could be
> introduced earlier
>

Yes. Will include patches 3-20 post as this technically is required once
the mutex is removed.
 
> > +
> >  static int ct_write(struct intel_guc_ct *ct,
> >  		    const u32 *action,
> >  		    u32 len /* in dwords */,
> > -		    u32 fence)
> > +		    u32 fence, u32 flags)
> >  {
> >  	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >  	struct guc_ct_buffer_desc *desc = ctb->desc;
> > @@ -393,9 +411,13 @@ static int ct_write(struct intel_guc_ct *ct,
> >  		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
> >  		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
> >  
> > -	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> > -	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> > -			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
> > +	hxg = (flags & INTEL_GUC_SEND_NB) ?
> > +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> > +		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
> > +			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
> > +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> > +		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> > +			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
> >  
> >  	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
> >  		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
> > @@ -412,6 +434,12 @@ static int ct_write(struct intel_guc_ct *ct,
> >  	}
> >  	GEM_BUG_ON(tail > size);
> >  
> > +	/*
> > +	 * make sure H2G buffer update and LRC tail update (if this triggering a
> > +	 * submission) are visable before updating the descriptor tail
> 
> typo
> 
> > +	 */
> > +	write_barrier(ct);
> > +
> >  	/* now update descriptor */
> >  	WRITE_ONCE(desc->tail, tail);
> >  
> > @@ -466,6 +494,46 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> >  	return err;
> >  }
> >  
> > +static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
> > +{
> > +	struct guc_ct_buffer_desc *desc = ctb->desc;
> > +	u32 head = READ_ONCE(desc->head);
> > +	u32 space;
> > +
> > +	space = CIRC_SPACE(desc->tail, head, ctb->size);
> 
> shouldn't we use READ_ONCE for reading the tail?
>

I don't think so. The above READ_ONCE should be sufficient as a barrier.
 
> > +
> > +	return space >= len_dw;
> > +}
> > +
> > +static int ct_send_nb(struct intel_guc_ct *ct,
> > +		      const u32 *action,
> > +		      u32 len,
> > +		      u32 flags)
> > +{
> > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > +	unsigned long spin_flags;
> > +	u32 fence;
> > +	int ret;
> > +
> > +	spin_lock_irqsave(&ctb->lock, spin_flags);
> > +
> > +	ret = ctb_has_room(ctb, len + 1);
> 
> why +1 ?

The header is 2 DWs while the action array only include 1 DW for the
action field which is stuffed into the header.

> 
> > +	if (unlikely(ret))
> > +		goto out;
> > +
> > +	fence = ct_get_next_fence(ct);
> > +	ret = ct_write(ct, action, len, fence, flags);
> > +	if (unlikely(ret))
> > +		goto out;
> > +
> > +	intel_guc_notify(ct_to_guc(ct));
> > +
> > +out:
> > +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > +
> > +	return ret;
> > +}
> > +
> >  static int ct_send(struct intel_guc_ct *ct,
> >  		   const u32 *action,
> >  		   u32 len,
> > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >  		   u32 response_buf_size,
> >  		   u32 *status)
> >  {
> > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >  	struct ct_request request;
> >  	unsigned long flags;
> >  	u32 fence;
> > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >  	GEM_BUG_ON(!len);
> >  	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >  	GEM_BUG_ON(!response_buf && response_buf_size);
> > +	might_sleep();
> >  
> > +	/*
> > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > +	 * buffers are sized correctly the flow control condition should be
> > +	 * rare.
> > +	 */
> > +retry:
> >  	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> 
> why +1 ?
>

Same as above.

> > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > +		cond_resched();
> > +		goto retry;
> > +	}
> 
> hmm, full CTB can also be seen in case of nb, but it looks that only in
> case of blocking call you want to use lazy spin, why ?
>

Blocking calls are rare + no having credits is rare. No need to over
engineering the wait.
 
> also, what if situation is not improving ?
> will we be looping here forever ?
>

Nope, see the following patch:
https://patchwork.freedesktop.org/patch/432325/?series=89844&rev=1
 
> >  
> >  	fence = ct_get_next_fence(ct);
> >  	request.fence = fence;
> > @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >  	list_add_tail(&request.link, &ct->requests.pending);
> >  	spin_unlock(&ct->requests.lock);
> >  
> > -	err = ct_write(ct, action, len, fence);
> > +	err = ct_write(ct, action, len, fence, 0);
> >  
> >  	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >  
> > @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >   * Command Transport (CT) buffer based GuC send function.
> >   */
> >  int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > -		      u32 *response_buf, u32 response_buf_size)
> > +		      u32 *response_buf, u32 response_buf_size, u32 flags)
> >  {
> >  	u32 status = ~0; /* undefined */
> >  	int ret;
> > @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> >  		return -ENODEV;
> >  	}
> >  
> > +	if (flags & INTEL_GUC_SEND_NB)
> > +		return ct_send_nb(ct, action, len, flags);
> > +
> >  	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> >  	if (unlikely(ret < 0)) {
> >  		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > index 1ae2dde6db93..55ef7c52472f 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > @@ -9,6 +9,7 @@
> >  #include <linux/interrupt.h>
> >  #include <linux/spinlock.h>
> >  #include <linux/workqueue.h>
> > +#include <linux/ktime.h>
> >  
> >  #include "intel_guc_fwif.h"
> >  
> > @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
> >  	bool broken;
> >  };
> >  
> > -
> >  /** Top-level structure for Command Transport related data
> >   *
> >   * Includes a pair of CT buffers for bi-directional communication and tracking
> > @@ -69,6 +69,9 @@ struct intel_guc_ct {
> >  		struct list_head incoming; /* incoming requests */
> >  		struct work_struct worker; /* handler for incoming requests */
> >  	} requests;
> > +
> > +	/** @stall_time: time of first time a CTB submission is stalled */
> > +	ktime_t stall_time;
> 
> this should be introduced in 37/97
>

Yep.

Matt
 
> >  };
> >  
> >  void intel_guc_ct_init_early(struct intel_guc_ct *ct);
> > @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
> >  }
> >  
> >  int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > -		      u32 *response_buf, u32 response_buf_size);
> > +		      u32 *response_buf, u32 response_buf_size, u32 flags);
> >  void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
> >  
> >  #endif /* _INTEL_GUC_CT_H_ */
> >
Tvrtko Ursulin May 26, 2021, 8:57 a.m. UTC | #5
On 25/05/2021 18:21, Matthew Brost wrote:
> On Tue, May 25, 2021 at 10:21:00AM +0100, Tvrtko Ursulin wrote:
>>
>> On 06/05/2021 20:13, Matthew Brost wrote:
>>> Add non blocking CTB send function, intel_guc_send_nb. In order to
>>> support a non blocking CTB send function a spin lock is needed to
>>> protect the CTB descriptors fields. Also the non blocking call must not
>>> update the fence value as this value is owned by the blocking call
>>> (intel_guc_send).
>>
>> Could the commit message say why the non-blocking send function is needed?
>>
> 
> Sure. Something like:
> 
> 'CTBs will be used in the critical patch of GuC submission and there is
> no need to wait for each CTB complete before moving on the i915'

A bit more, like also mentioning the critical path is with interrupts disabled or so. And not just that there is no need to wait but waiting is not possible because this or that. So only choice is to do this busy loop send. It's a bit horrible so justification needs to be documented.

>>> The blocking CTB now must have a flow control mechanism to ensure the
>>> buffer isn't overrun. A lazy spin wait is used as we believe the flow
>>> control condition should be rare with properly sized buffer.
>>>
>>> The function, intel_guc_send_nb, is exported in this patch but unused.
>>> Several patches later in the series make use of this function.
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
>>>    drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
>>>    drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
>>>    3 files changed, 105 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> index c20f3839de12..4c0a367e41d8 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> @@ -75,7 +75,15 @@ static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
>>>    static
>>>    inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
>>>    {
>>> -	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
>>> +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
>>> +}
>>> +
>>> +#define INTEL_GUC_SEND_NB		BIT(31)
>>> +static
>>> +inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
>>> +{
>>> +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
>>> +				 INTEL_GUC_SEND_NB);
>>>    }
>>>    static inline int
>>> @@ -83,7 +91,7 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
>>>    			   u32 *response_buf, u32 response_buf_size)
>>>    {
>>>    	return intel_guc_ct_send(&guc->ct, action, len,
>>> -				 response_buf, response_buf_size);
>>> +				 response_buf, response_buf_size, 0);
>>>    }
>>>    static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> index a76603537fa8..af7314d45a78 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> @@ -3,6 +3,11 @@
>>>     * Copyright © 2016-2019 Intel Corporation
>>>     */
>>> +#include <linux/circ_buf.h>
>>> +#include <linux/ktime.h>
>>> +#include <linux/time64.h>
>>> +#include <linux/timekeeping.h>
>>> +
>>>    #include "i915_drv.h"
>>>    #include "intel_guc_ct.h"
>>>    #include "gt/intel_gt.h"
>>> @@ -308,6 +313,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>>>    	if (unlikely(err))
>>>    		goto err_deregister;
>>> +	ct->requests.last_fence = 1;
>>>    	ct->enabled = true;
>>>    	return 0;
>>> @@ -343,10 +349,22 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
>>>    	return ++ct->requests.last_fence;
>>>    }
>>> +static void write_barrier(struct intel_guc_ct *ct) {
>>> +	struct intel_guc *guc = ct_to_guc(ct);
>>> +	struct intel_gt *gt = guc_to_gt(guc);
>>> +
>>> +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
>>> +		GEM_BUG_ON(guc->send_regs.fw_domains);
>>> +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
>>
>> It's safe to write to this reg? Does it need a comment to explain it?
>>
> 
> Yes, it is same. IMO 'SCRATCH' in the name is enough documentation.

Why would it be enough? It requires digging to figure it out since it appears these are some sort of GuC special registers and not generic scratch:

commit 2d4ed3a988e6b1ff9729d0edd74bf4890571253e
Author: Michal Wajdeczko <michal.wajdeczko@intel.com>
Date:   Mon May 27 18:36:05 2019 +0000

     drm/i915/guc: New GuC scratch registers for Gen11

If it was a normal scratch then async trashing of those from a random driver thread isn't per se safe if used from a GPU context running in parallel.

But then according to bspec they are called VF_SW_FLAG_<n> and not GEN11_SOFT_SCRATCH so yeah.

>   
>>> +	} else {
>>> +		wmb();
>>> +	}
>>> +}
>>> +
>>>    static int ct_write(struct intel_guc_ct *ct,
>>>    		    const u32 *action,
>>>    		    u32 len /* in dwords */,
>>> -		    u32 fence)
>>> +		    u32 fence, u32 flags)
>>>    {
>>>    	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>    	struct guc_ct_buffer_desc *desc = ctb->desc;
>>> @@ -393,9 +411,13 @@ static int ct_write(struct intel_guc_ct *ct,
>>>    		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
>>>    		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
>>> -	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
>>> -	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
>>> -			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
>>> +	hxg = (flags & INTEL_GUC_SEND_NB) ?
>>> +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
>>> +		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
>>> +			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
>>> +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
>>> +		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
>>> +			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
>>>    	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
>>>    		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
>>> @@ -412,6 +434,12 @@ static int ct_write(struct intel_guc_ct *ct,
>>>    	}
>>>    	GEM_BUG_ON(tail > size);
>>> +	/*
>>> +	 * make sure H2G buffer update and LRC tail update (if this triggering a
>>> +	 * submission) are visable before updating the descriptor tail
>>> +	 */
>>> +	write_barrier(ct);
>>> +
>>>    	/* now update descriptor */
>>>    	WRITE_ONCE(desc->tail, tail);
>>> @@ -466,6 +494,46 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>>>    	return err;
>>>    }
>>> +static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>>> +{
>>> +	struct guc_ct_buffer_desc *desc = ctb->desc;
>>> +	u32 head = READ_ONCE(desc->head);
>>> +	u32 space;
>>> +
>>> +	space = CIRC_SPACE(desc->tail, head, ctb->size);
>>> +
>>> +	return space >= len_dw;
>>> +}
>>> +
>>> +static int ct_send_nb(struct intel_guc_ct *ct,
>>> +		      const u32 *action,
>>> +		      u32 len,
>>> +		      u32 flags)
>>> +{
>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>> +	unsigned long spin_flags;
>>> +	u32 fence;
>>> +	int ret;
>>> +
>>> +	spin_lock_irqsave(&ctb->lock, spin_flags);
>>> +
>>> +	ret = ctb_has_room(ctb, len + 1);
>>> +	if (unlikely(ret))
>>> +		goto out;
>>> +
>>> +	fence = ct_get_next_fence(ct);
>>> +	ret = ct_write(ct, action, len, fence, flags);
>>> +	if (unlikely(ret))
>>> +		goto out;
>>> +
>>> +	intel_guc_notify(ct_to_guc(ct));
>>> +
>>> +out:
>>> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>    static int ct_send(struct intel_guc_ct *ct,
>>>    		   const u32 *action,
>>>    		   u32 len,
>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>    		   u32 response_buf_size,
>>>    		   u32 *status)
>>>    {
>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>    	struct ct_request request;
>>>    	unsigned long flags;
>>>    	u32 fence;
>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>    	GEM_BUG_ON(!len);
>>>    	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>    	GEM_BUG_ON(!response_buf && response_buf_size);
>>> +	might_sleep();
>>
>> Sleep is just cond_resched below or there is more?
>>
> 
> Yes, the cond_resched.
> 
>>> +	/*
>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>> +	 * buffers are sized correctly the flow control condition should be
>>> +	 * rare.
>>> +	 */
>>> +retry:
>>>    	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>> +		cond_resched();
>>> +		goto retry;
>>> +	}
>>
>> If this patch is about adding a non-blocking send function, and below we can
>> see that it creates a fork:
>>
>> intel_guc_ct_send:
>> ...
>> 	if (flags & INTEL_GUC_SEND_NB)
>> 		return ct_send_nb(ct, action, len, flags);
>>
>>   	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>
>> Then why is there a change in ct_send here, which is not the new
>> non-blocking path?
>>
> 
> There is not a change to ct_send(), just to intel_guc_ct_send.

I was doing by the diff which says:

  static int ct_send(struct intel_guc_ct *ct,
  		   const u32 *action,
  		   u32 len,
@@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
  		   u32 response_buf_size,
  		   u32 *status)
  {
+	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
  	struct ct_request request;
  	unsigned long flags;
  	u32 fence;
@@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
  	GEM_BUG_ON(!len);
  	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
  	GEM_BUG_ON(!response_buf && response_buf_size);
+	might_sleep();
  
+	/*
+	 * We use a lazy spin wait loop here as we believe that if the CT
+	 * buffers are sized correctly the flow control condition should be
+	 * rare.
+	 */
+retry:
  	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
+	if (unlikely(!ctb_has_room(ctb, len + 1))) {
+		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
+		cond_resched();
+		goto retry;
+	}

So it looks like a change to ct_send to me. Is that wrong?

Regards,

Tvrtko

> As for why intel_guc_ct_send is updated and we don't just a new public
> function, this was another reviewers suggestion. Again can't make
> everyone happy.
>   
>>>    	fence = ct_get_next_fence(ct);
>>>    	request.fence = fence;
>>> @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>    	list_add_tail(&request.link, &ct->requests.pending);
>>>    	spin_unlock(&ct->requests.lock);
>>> -	err = ct_write(ct, action, len, fence);
>>> +	err = ct_write(ct, action, len, fence, 0);
>>>    	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>> @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>     * Command Transport (CT) buffer based GuC send function.
>>>     */
>>>    int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>>> -		      u32 *response_buf, u32 response_buf_size)
>>> +		      u32 *response_buf, u32 response_buf_size, u32 flags)
>>>    {
>>>    	u32 status = ~0; /* undefined */
>>>    	int ret;
>>> @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>>>    		return -ENODEV;
>>>    	}
>>> +	if (flags & INTEL_GUC_SEND_NB)
>>> +		return ct_send_nb(ct, action, len, flags);
>>> +
>>>    	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>>    	if (unlikely(ret < 0)) {
>>>    		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
>>> index 1ae2dde6db93..55ef7c52472f 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
>>> @@ -9,6 +9,7 @@
>>>    #include <linux/interrupt.h>
>>>    #include <linux/spinlock.h>
>>>    #include <linux/workqueue.h>
>>> +#include <linux/ktime.h>
>>>    #include "intel_guc_fwif.h"
>>> @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
>>>    	bool broken;
>>>    };
>>> -
>>>    /** Top-level structure for Command Transport related data
>>>     *
>>>     * Includes a pair of CT buffers for bi-directional communication and tracking
>>> @@ -69,6 +69,9 @@ struct intel_guc_ct {
>>>    		struct list_head incoming; /* incoming requests */
>>>    		struct work_struct worker; /* handler for incoming requests */
>>>    	} requests;
>>> +
>>> +	/** @stall_time: time of first time a CTB submission is stalled */
>>> +	ktime_t stall_time;
>>
>> Unused in this patch.
>>
> 
> Yea, wrong patch. Will fix.
> 
> Matt
>   
>>>    };
>>>    void intel_guc_ct_init_early(struct intel_guc_ct *ct);
>>> @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
>>>    }
>>>    int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>>> -		      u32 *response_buf, u32 response_buf_size);
>>> +		      u32 *response_buf, u32 response_buf_size, u32 flags);
>>>    void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
>>>    #endif /* _INTEL_GUC_CT_H_ */
>>>
>>
>> Regards,
>>
>> Tvrtko
Matthew Brost May 26, 2021, 6:10 p.m. UTC | #6
On Wed, May 26, 2021 at 09:57:10AM +0100, Tvrtko Ursulin wrote:
> 
> On 25/05/2021 18:21, Matthew Brost wrote:
> > On Tue, May 25, 2021 at 10:21:00AM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 06/05/2021 20:13, Matthew Brost wrote:
> > > > Add non blocking CTB send function, intel_guc_send_nb. In order to
> > > > support a non blocking CTB send function a spin lock is needed to
> > > > protect the CTB descriptors fields. Also the non blocking call must not
> > > > update the fence value as this value is owned by the blocking call
> > > > (intel_guc_send).
> > > 
> > > Could the commit message say why the non-blocking send function is needed?
> > > 
> > 
> > Sure. Something like:
> > 
> > 'CTBs will be used in the critical patch of GuC submission and there is
> > no need to wait for each CTB complete before moving on the i915'
> 
> A bit more, like also mentioning the critical path is with interrupts disabled or so. And not just that there is no need to wait but waiting is not possible because this or that. So only choice is to do this busy loop send. It's a bit horrible so justification needs to be documented.
> 

Don't I basically say all this? Anyways I'll scrub this comment.

> > > > The blocking CTB now must have a flow control mechanism to ensure the
> > > > buffer isn't overrun. A lazy spin wait is used as we believe the flow
> > > > control condition should be rare with properly sized buffer.
> > > > 
> > > > The function, intel_guc_send_nb, is exported in this patch but unused.
> > > > Several patches later in the series make use of this function.
> > > > 
> > > > Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/i915/gt/uc/intel_guc.h    | 12 ++-
> > > >    drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 96 +++++++++++++++++++++--
> > > >    drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  7 +-
> > > >    3 files changed, 105 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > > > index c20f3839de12..4c0a367e41d8 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > > > @@ -75,7 +75,15 @@ static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
> > > >    static
> > > >    inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
> > > >    {
> > > > -	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
> > > > +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
> > > > +}
> > > > +
> > > > +#define INTEL_GUC_SEND_NB		BIT(31)
> > > > +static
> > > > +inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
> > > > +{
> > > > +	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
> > > > +				 INTEL_GUC_SEND_NB);
> > > >    }
> > > >    static inline int
> > > > @@ -83,7 +91,7 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
> > > >    			   u32 *response_buf, u32 response_buf_size)
> > > >    {
> > > >    	return intel_guc_ct_send(&guc->ct, action, len,
> > > > -				 response_buf, response_buf_size);
> > > > +				 response_buf, response_buf_size, 0);
> > > >    }
> > > >    static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > > index a76603537fa8..af7314d45a78 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > > > @@ -3,6 +3,11 @@
> > > >     * Copyright © 2016-2019 Intel Corporation
> > > >     */
> > > > +#include <linux/circ_buf.h>
> > > > +#include <linux/ktime.h>
> > > > +#include <linux/time64.h>
> > > > +#include <linux/timekeeping.h>
> > > > +
> > > >    #include "i915_drv.h"
> > > >    #include "intel_guc_ct.h"
> > > >    #include "gt/intel_gt.h"
> > > > @@ -308,6 +313,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
> > > >    	if (unlikely(err))
> > > >    		goto err_deregister;
> > > > +	ct->requests.last_fence = 1;
> > > >    	ct->enabled = true;
> > > >    	return 0;
> > > > @@ -343,10 +349,22 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
> > > >    	return ++ct->requests.last_fence;
> > > >    }
> > > > +static void write_barrier(struct intel_guc_ct *ct) {
> > > > +	struct intel_guc *guc = ct_to_guc(ct);
> > > > +	struct intel_gt *gt = guc_to_gt(guc);
> > > > +
> > > > +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > > > +		GEM_BUG_ON(guc->send_regs.fw_domains);
> > > > +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> > > 
> > > It's safe to write to this reg? Does it need a comment to explain it?
> > > 
> > 
> > Yes, it is same. IMO 'SCRATCH' in the name is enough documentation.
> 
> Why would it be enough? It requires digging to figure it out since it appears these are some sort of GuC special registers and not generic scratch:
> 
> commit 2d4ed3a988e6b1ff9729d0edd74bf4890571253e
> Author: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Date:   Mon May 27 18:36:05 2019 +0000
> 
>     drm/i915/guc: New GuC scratch registers for Gen11
> 
> If it was a normal scratch then async trashing of those from a random driver thread isn't per se safe if used from a GPU context running in parallel.
> 
> But then according to bspec they are called VF_SW_FLAG_<n> and not GEN11_SOFT_SCRATCH so yeah.
> 

Moved this part into its own patch and added comment indicating why they
are safe to use.

Matt 

> > > > +	} else {
> > > > +		wmb();
> > > > +	}
> > > > +}
> > > > +
> > > >    static int ct_write(struct intel_guc_ct *ct,
> > > >    		    const u32 *action,
> > > >    		    u32 len /* in dwords */,
> > > > -		    u32 fence)
> > > > +		    u32 fence, u32 flags)
> > > >    {
> > > >    	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > >    	struct guc_ct_buffer_desc *desc = ctb->desc;
> > > > @@ -393,9 +411,13 @@ static int ct_write(struct intel_guc_ct *ct,
> > > >    		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
> > > >    		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
> > > > -	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> > > > -	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> > > > -			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
> > > > +	hxg = (flags & INTEL_GUC_SEND_NB) ?
> > > > +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> > > > +		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
> > > > +			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
> > > > +		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
> > > > +		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
> > > > +			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
> > > >    	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
> > > >    		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
> > > > @@ -412,6 +434,12 @@ static int ct_write(struct intel_guc_ct *ct,
> > > >    	}
> > > >    	GEM_BUG_ON(tail > size);
> > > > +	/*
> > > > +	 * make sure H2G buffer update and LRC tail update (if this triggering a
> > > > +	 * submission) are visable before updating the descriptor tail
> > > > +	 */
> > > > +	write_barrier(ct);
> > > > +
> > > >    	/* now update descriptor */
> > > >    	WRITE_ONCE(desc->tail, tail);
> > > > @@ -466,6 +494,46 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> > > >    	return err;
> > > >    }
> > > > +static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
> > > > +{
> > > > +	struct guc_ct_buffer_desc *desc = ctb->desc;
> > > > +	u32 head = READ_ONCE(desc->head);
> > > > +	u32 space;
> > > > +
> > > > +	space = CIRC_SPACE(desc->tail, head, ctb->size);
> > > > +
> > > > +	return space >= len_dw;
> > > > +}
> > > > +
> > > > +static int ct_send_nb(struct intel_guc_ct *ct,
> > > > +		      const u32 *action,
> > > > +		      u32 len,
> > > > +		      u32 flags)
> > > > +{
> > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > +	unsigned long spin_flags;
> > > > +	u32 fence;
> > > > +	int ret;
> > > > +
> > > > +	spin_lock_irqsave(&ctb->lock, spin_flags);
> > > > +
> > > > +	ret = ctb_has_room(ctb, len + 1);
> > > > +	if (unlikely(ret))
> > > > +		goto out;
> > > > +
> > > > +	fence = ct_get_next_fence(ct);
> > > > +	ret = ct_write(ct, action, len, fence, flags);
> > > > +	if (unlikely(ret))
> > > > +		goto out;
> > > > +
> > > > +	intel_guc_notify(ct_to_guc(ct));
> > > > +
> > > > +out:
> > > > +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > >    static int ct_send(struct intel_guc_ct *ct,
> > > >    		   const u32 *action,
> > > >    		   u32 len,
> > > > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >    		   u32 response_buf_size,
> > > >    		   u32 *status)
> > > >    {
> > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > >    	struct ct_request request;
> > > >    	unsigned long flags;
> > > >    	u32 fence;
> > > > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >    	GEM_BUG_ON(!len);
> > > >    	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > >    	GEM_BUG_ON(!response_buf && response_buf_size);
> > > > +	might_sleep();
> > > 
> > > Sleep is just cond_resched below or there is more?
> > > 
> > 
> > Yes, the cond_resched.
> > 
> > > > +	/*
> > > > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > > > +	 * buffers are sized correctly the flow control condition should be
> > > > +	 * rare.
> > > > +	 */
> > > > +retry:
> > > >    	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > +		cond_resched();
> > > > +		goto retry;
> > > > +	}
> > > 
> > > If this patch is about adding a non-blocking send function, and below we can
> > > see that it creates a fork:
> > > 
> > > intel_guc_ct_send:
> > > ...
> > > 	if (flags & INTEL_GUC_SEND_NB)
> > > 		return ct_send_nb(ct, action, len, flags);
> > > 
> > >   	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > 
> > > Then why is there a change in ct_send here, which is not the new
> > > non-blocking path?
> > > 
> > 
> > There is not a change to ct_send(), just to intel_guc_ct_send.
> 
> I was doing by the diff which says:
> 
>  static int ct_send(struct intel_guc_ct *ct,
>  		   const u32 *action,
>  		   u32 len,
> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>  		   u32 response_buf_size,
>  		   u32 *status)
>  {
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>  	struct ct_request request;
>  	unsigned long flags;
>  	u32 fence;
> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>  	GEM_BUG_ON(!len);
>  	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>  	GEM_BUG_ON(!response_buf && response_buf_size);
> +	might_sleep();
> +	/*
> +	 * We use a lazy spin wait loop here as we believe that if the CT
> +	 * buffers are sized correctly the flow control condition should be
> +	 * rare.
> +	 */
> +retry:
>  	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> +		cond_resched();
> +		goto retry;
> +	}
> 
> So it looks like a change to ct_send to me. Is that wrong?
> 
> Regards,
> 
> Tvrtko
> 
> > As for why intel_guc_ct_send is updated and we don't just a new public
> > function, this was another reviewers suggestion. Again can't make
> > everyone happy.
> > > >    	fence = ct_get_next_fence(ct);
> > > >    	request.fence = fence;
> > > > @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >    	list_add_tail(&request.link, &ct->requests.pending);
> > > >    	spin_unlock(&ct->requests.lock);
> > > > -	err = ct_write(ct, action, len, fence);
> > > > +	err = ct_write(ct, action, len, fence, 0);
> > > >    	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >     * Command Transport (CT) buffer based GuC send function.
> > > >     */
> > > >    int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > > > -		      u32 *response_buf, u32 response_buf_size)
> > > > +		      u32 *response_buf, u32 response_buf_size, u32 flags)
> > > >    {
> > > >    	u32 status = ~0; /* undefined */
> > > >    	int ret;
> > > > @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > > >    		return -ENODEV;
> > > >    	}
> > > > +	if (flags & INTEL_GUC_SEND_NB)
> > > > +		return ct_send_nb(ct, action, len, flags);
> > > > +
> > > >    	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > >    	if (unlikely(ret < 0)) {
> > > >    		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > > > index 1ae2dde6db93..55ef7c52472f 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > > > @@ -9,6 +9,7 @@
> > > >    #include <linux/interrupt.h>
> > > >    #include <linux/spinlock.h>
> > > >    #include <linux/workqueue.h>
> > > > +#include <linux/ktime.h>
> > > >    #include "intel_guc_fwif.h"
> > > > @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
> > > >    	bool broken;
> > > >    };
> > > > -
> > > >    /** Top-level structure for Command Transport related data
> > > >     *
> > > >     * Includes a pair of CT buffers for bi-directional communication and tracking
> > > > @@ -69,6 +69,9 @@ struct intel_guc_ct {
> > > >    		struct list_head incoming; /* incoming requests */
> > > >    		struct work_struct worker; /* handler for incoming requests */
> > > >    	} requests;
> > > > +
> > > > +	/** @stall_time: time of first time a CTB submission is stalled */
> > > > +	ktime_t stall_time;
> > > 
> > > Unused in this patch.
> > > 
> > 
> > Yea, wrong patch. Will fix.
> > 
> > Matt
> > > >    };
> > > >    void intel_guc_ct_init_early(struct intel_guc_ct *ct);
> > > > @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
> > > >    }
> > > >    int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > > > -		      u32 *response_buf, u32 response_buf_size);
> > > > +		      u32 *response_buf, u32 response_buf_size, u32 flags);
> > > >    void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
> > > >    #endif /* _INTEL_GUC_CT_H_ */
> > > > 
> > > 
> > > Regards,
> > > 
> > > Tvrtko
Tvrtko Ursulin May 27, 2021, 10:02 a.m. UTC | #7
On 26/05/2021 19:10, Matthew Brost wrote:

[snip]

>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
>>>>> +		      const u32 *action,
>>>>> +		      u32 len,
>>>>> +		      u32 flags)
>>>>> +{
>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>> +	unsigned long spin_flags;
>>>>> +	u32 fence;
>>>>> +	int ret;
>>>>> +
>>>>> +	spin_lock_irqsave(&ctb->lock, spin_flags);
>>>>> +
>>>>> +	ret = ctb_has_room(ctb, len + 1);
>>>>> +	if (unlikely(ret))
>>>>> +		goto out;
>>>>> +
>>>>> +	fence = ct_get_next_fence(ct);
>>>>> +	ret = ct_write(ct, action, len, fence, flags);
>>>>> +	if (unlikely(ret))
>>>>> +		goto out;
>>>>> +
>>>>> +	intel_guc_notify(ct_to_guc(ct));
>>>>> +
>>>>> +out:
>>>>> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
>>>>> +
>>>>> +	return ret;
>>>>> +}
>>>>> +
>>>>>     static int ct_send(struct intel_guc_ct *ct,
>>>>>     		   const u32 *action,
>>>>>     		   u32 len,
>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>     		   u32 response_buf_size,
>>>>>     		   u32 *status)
>>>>>     {
>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>     	struct ct_request request;
>>>>>     	unsigned long flags;
>>>>>     	u32 fence;
>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>     	GEM_BUG_ON(!len);
>>>>>     	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>     	GEM_BUG_ON(!response_buf && response_buf_size);
>>>>> +	might_sleep();
>>>>
>>>> Sleep is just cond_resched below or there is more?
>>>>
>>>
>>> Yes, the cond_resched.
>>>
>>>>> +	/*
>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>>> +	 * buffers are sized correctly the flow control condition should be
>>>>> +	 * rare.
>>>>> +	 */
>>>>> +retry:
>>>>>     	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>> +		cond_resched();
>>>>> +		goto retry;
>>>>> +	}
>>>>
>>>> If this patch is about adding a non-blocking send function, and below we can
>>>> see that it creates a fork:
>>>>
>>>> intel_guc_ct_send:
>>>> ...
>>>> 	if (flags & INTEL_GUC_SEND_NB)
>>>> 		return ct_send_nb(ct, action, len, flags);
>>>>
>>>>    	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>>>
>>>> Then why is there a change in ct_send here, which is not the new
>>>> non-blocking path?
>>>>
>>>
>>> There is not a change to ct_send(), just to intel_guc_ct_send.
>>
>> I was doing by the diff which says:
>>
>>   static int ct_send(struct intel_guc_ct *ct,
>>   		   const u32 *action,
>>   		   u32 len,
>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>   		   u32 response_buf_size,
>>   		   u32 *status)
>>   {
>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>   	struct ct_request request;
>>   	unsigned long flags;
>>   	u32 fence;
>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>   	GEM_BUG_ON(!len);
>>   	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>   	GEM_BUG_ON(!response_buf && response_buf_size);
>> +	might_sleep();
>> +	/*
>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>> +	 * buffers are sized correctly the flow control condition should be
>> +	 * rare.
>> +	 */
>> +retry:
>>   	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>> +		cond_resched();
>> +		goto retry;
>> +	}
>>
>> So it looks like a change to ct_send to me. Is that wrong?

What about this part - is the patch changing the blocking ct_send or 
not, and if it is why?

Regards,

Tvrtko


>>
>> Regards,
>>
>> Tvrtko
>>
>>> As for why intel_guc_ct_send is updated and we don't just a new public
>>> function, this was another reviewers suggestion. Again can't make
>>> everyone happy.
>>>>>     	fence = ct_get_next_fence(ct);
>>>>>     	request.fence = fence;
>>>>> @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>     	list_add_tail(&request.link, &ct->requests.pending);
>>>>>     	spin_unlock(&ct->requests.lock);
>>>>> -	err = ct_write(ct, action, len, fence);
>>>>> +	err = ct_write(ct, action, len, fence, 0);
>>>>>     	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>> @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>      * Command Transport (CT) buffer based GuC send function.
>>>>>      */
>>>>>     int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>>>>> -		      u32 *response_buf, u32 response_buf_size)
>>>>> +		      u32 *response_buf, u32 response_buf_size, u32 flags)
>>>>>     {
>>>>>     	u32 status = ~0; /* undefined */
>>>>>     	int ret;
>>>>> @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>>>>>     		return -ENODEV;
>>>>>     	}
>>>>> +	if (flags & INTEL_GUC_SEND_NB)
>>>>> +		return ct_send_nb(ct, action, len, flags);
>>>>> +
>>>>>     	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>>>>     	if (unlikely(ret < 0)) {
>>>>>     		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
>>>>> index 1ae2dde6db93..55ef7c52472f 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
>>>>> @@ -9,6 +9,7 @@
>>>>>     #include <linux/interrupt.h>
>>>>>     #include <linux/spinlock.h>
>>>>>     #include <linux/workqueue.h>
>>>>> +#include <linux/ktime.h>
>>>>>     #include "intel_guc_fwif.h"
>>>>> @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
>>>>>     	bool broken;
>>>>>     };
>>>>> -
>>>>>     /** Top-level structure for Command Transport related data
>>>>>      *
>>>>>      * Includes a pair of CT buffers for bi-directional communication and tracking
>>>>> @@ -69,6 +69,9 @@ struct intel_guc_ct {
>>>>>     		struct list_head incoming; /* incoming requests */
>>>>>     		struct work_struct worker; /* handler for incoming requests */
>>>>>     	} requests;
>>>>> +
>>>>> +	/** @stall_time: time of first time a CTB submission is stalled */
>>>>> +	ktime_t stall_time;
>>>>
>>>> Unused in this patch.
>>>>
>>>
>>> Yea, wrong patch. Will fix.
>>>
>>> Matt
>>>>>     };
>>>>>     void intel_guc_ct_init_early(struct intel_guc_ct *ct);
>>>>> @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
>>>>>     }
>>>>>     int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>>>>> -		      u32 *response_buf, u32 response_buf_size);
>>>>> +		      u32 *response_buf, u32 response_buf_size, u32 flags);
>>>>>     void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
>>>>>     #endif /* _INTEL_GUC_CT_H_ */
>>>>>
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
Matthew Brost May 27, 2021, 2:35 p.m. UTC | #8
On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> 
> On 26/05/2021 19:10, Matthew Brost wrote:
> 
> [snip]
> 
> > > > > > +static int ct_send_nb(struct intel_guc_ct *ct,
> > > > > > +		      const u32 *action,
> > > > > > +		      u32 len,
> > > > > > +		      u32 flags)
> > > > > > +{
> > > > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > > +	unsigned long spin_flags;
> > > > > > +	u32 fence;
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	spin_lock_irqsave(&ctb->lock, spin_flags);
> > > > > > +
> > > > > > +	ret = ctb_has_room(ctb, len + 1);
> > > > > > +	if (unlikely(ret))
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	fence = ct_get_next_fence(ct);
> > > > > > +	ret = ct_write(ct, action, len, fence, flags);
> > > > > > +	if (unlikely(ret))
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	intel_guc_notify(ct_to_guc(ct));
> > > > > > +
> > > > > > +out:
> > > > > > +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > > > > > +
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > >     static int ct_send(struct intel_guc_ct *ct,
> > > > > >     		   const u32 *action,
> > > > > >     		   u32 len,
> > > > > > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >     		   u32 response_buf_size,
> > > > > >     		   u32 *status)
> > > > > >     {
> > > > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > >     	struct ct_request request;
> > > > > >     	unsigned long flags;
> > > > > >     	u32 fence;
> > > > > > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >     	GEM_BUG_ON(!len);
> > > > > >     	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > > >     	GEM_BUG_ON(!response_buf && response_buf_size);
> > > > > > +	might_sleep();
> > > > > 
> > > > > Sleep is just cond_resched below or there is more?
> > > > > 
> > > > 
> > > > Yes, the cond_resched.
> > > > 
> > > > > > +	/*
> > > > > > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > > > > > +	 * buffers are sized correctly the flow control condition should be
> > > > > > +	 * rare.
> > > > > > +	 */
> > > > > > +retry:
> > > > > >     	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > > > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > > > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > > > +		cond_resched();
> > > > > > +		goto retry;
> > > > > > +	}
> > > > > 
> > > > > If this patch is about adding a non-blocking send function, and below we can
> > > > > see that it creates a fork:
> > > > > 
> > > > > intel_guc_ct_send:
> > > > > ...
> > > > > 	if (flags & INTEL_GUC_SEND_NB)
> > > > > 		return ct_send_nb(ct, action, len, flags);
> > > > > 
> > > > >    	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > > > 
> > > > > Then why is there a change in ct_send here, which is not the new
> > > > > non-blocking path?
> > > > > 
> > > > 
> > > > There is not a change to ct_send(), just to intel_guc_ct_send.
> > > 
> > > I was doing by the diff which says:
> > > 
> > >   static int ct_send(struct intel_guc_ct *ct,
> > >   		   const u32 *action,
> > >   		   u32 len,
> > > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > >   		   u32 response_buf_size,
> > >   		   u32 *status)
> > >   {
> > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > >   	struct ct_request request;
> > >   	unsigned long flags;
> > >   	u32 fence;
> > > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > >   	GEM_BUG_ON(!len);
> > >   	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > >   	GEM_BUG_ON(!response_buf && response_buf_size);
> > > +	might_sleep();
> > > +	/*
> > > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > > +	 * buffers are sized correctly the flow control condition should be
> > > +	 * rare.
> > > +	 */
> > > +retry:
> > >   	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > +		cond_resched();
> > > +		goto retry;
> > > +	}
> > > 
> > > So it looks like a change to ct_send to me. Is that wrong?
> 
> What about this part - is the patch changing the blocking ct_send or not,
> and if it is why?
> 

Yes, ct_send() changes. Sorry for the confusion.

This function needs to be updated to account for the H2G space and
backoff if no space is available.

Matt

> Regards,
> 
> Tvrtko
> 
> 
> > > 
> > > Regards,
> > > 
> > > Tvrtko
> > > 
> > > > As for why intel_guc_ct_send is updated and we don't just a new public
> > > > function, this was another reviewers suggestion. Again can't make
> > > > everyone happy.
> > > > > >     	fence = ct_get_next_fence(ct);
> > > > > >     	request.fence = fence;
> > > > > > @@ -495,7 +576,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >     	list_add_tail(&request.link, &ct->requests.pending);
> > > > > >     	spin_unlock(&ct->requests.lock);
> > > > > > -	err = ct_write(ct, action, len, fence);
> > > > > > +	err = ct_write(ct, action, len, fence, 0);
> > > > > >     	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > > > @@ -537,7 +618,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >      * Command Transport (CT) buffer based GuC send function.
> > > > > >      */
> > > > > >     int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > > > > > -		      u32 *response_buf, u32 response_buf_size)
> > > > > > +		      u32 *response_buf, u32 response_buf_size, u32 flags)
> > > > > >     {
> > > > > >     	u32 status = ~0; /* undefined */
> > > > > >     	int ret;
> > > > > > @@ -547,6 +628,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > > > > >     		return -ENODEV;
> > > > > >     	}
> > > > > > +	if (flags & INTEL_GUC_SEND_NB)
> > > > > > +		return ct_send_nb(ct, action, len, flags);
> > > > > > +
> > > > > >     	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > > > >     	if (unlikely(ret < 0)) {
> > > > > >     		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
> > > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > > > > > index 1ae2dde6db93..55ef7c52472f 100644
> > > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> > > > > > @@ -9,6 +9,7 @@
> > > > > >     #include <linux/interrupt.h>
> > > > > >     #include <linux/spinlock.h>
> > > > > >     #include <linux/workqueue.h>
> > > > > > +#include <linux/ktime.h>
> > > > > >     #include "intel_guc_fwif.h"
> > > > > > @@ -42,7 +43,6 @@ struct intel_guc_ct_buffer {
> > > > > >     	bool broken;
> > > > > >     };
> > > > > > -
> > > > > >     /** Top-level structure for Command Transport related data
> > > > > >      *
> > > > > >      * Includes a pair of CT buffers for bi-directional communication and tracking
> > > > > > @@ -69,6 +69,9 @@ struct intel_guc_ct {
> > > > > >     		struct list_head incoming; /* incoming requests */
> > > > > >     		struct work_struct worker; /* handler for incoming requests */
> > > > > >     	} requests;
> > > > > > +
> > > > > > +	/** @stall_time: time of first time a CTB submission is stalled */
> > > > > > +	ktime_t stall_time;
> > > > > 
> > > > > Unused in this patch.
> > > > > 
> > > > 
> > > > Yea, wrong patch. Will fix.
> > > > 
> > > > Matt
> > > > > >     };
> > > > > >     void intel_guc_ct_init_early(struct intel_guc_ct *ct);
> > > > > > @@ -88,7 +91,7 @@ static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
> > > > > >     }
> > > > > >     int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
> > > > > > -		      u32 *response_buf, u32 response_buf_size);
> > > > > > +		      u32 *response_buf, u32 response_buf_size, u32 flags);
> > > > > >     void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
> > > > > >     #endif /* _INTEL_GUC_CT_H_ */
> > > > > > 
> > > > > 
> > > > > Regards,
> > > > > 
> > > > > Tvrtko
Tvrtko Ursulin May 27, 2021, 3:11 p.m. UTC | #9
On 27/05/2021 15:35, Matthew Brost wrote:
> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
>>
>> On 26/05/2021 19:10, Matthew Brost wrote:
>>
>> [snip]
>>
>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
>>>>>>> +		      const u32 *action,
>>>>>>> +		      u32 len,
>>>>>>> +		      u32 flags)
>>>>>>> +{
>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>> +	unsigned long spin_flags;
>>>>>>> +	u32 fence;
>>>>>>> +	int ret;
>>>>>>> +
>>>>>>> +	spin_lock_irqsave(&ctb->lock, spin_flags);
>>>>>>> +
>>>>>>> +	ret = ctb_has_room(ctb, len + 1);
>>>>>>> +	if (unlikely(ret))
>>>>>>> +		goto out;
>>>>>>> +
>>>>>>> +	fence = ct_get_next_fence(ct);
>>>>>>> +	ret = ct_write(ct, action, len, fence, flags);
>>>>>>> +	if (unlikely(ret))
>>>>>>> +		goto out;
>>>>>>> +
>>>>>>> +	intel_guc_notify(ct_to_guc(ct));
>>>>>>> +
>>>>>>> +out:
>>>>>>> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
>>>>>>> +
>>>>>>> +	return ret;
>>>>>>> +}
>>>>>>> +
>>>>>>>      static int ct_send(struct intel_guc_ct *ct,
>>>>>>>      		   const u32 *action,
>>>>>>>      		   u32 len,
>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>      		   u32 response_buf_size,
>>>>>>>      		   u32 *status)
>>>>>>>      {
>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>      	struct ct_request request;
>>>>>>>      	unsigned long flags;
>>>>>>>      	u32 fence;
>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>      	GEM_BUG_ON(!len);
>>>>>>>      	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>>      	GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>>> +	might_sleep();
>>>>>>
>>>>>> Sleep is just cond_resched below or there is more?
>>>>>>
>>>>>
>>>>> Yes, the cond_resched.
>>>>>
>>>>>>> +	/*
>>>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>>>>> +	 * buffers are sized correctly the flow control condition should be
>>>>>>> +	 * rare.
>>>>>>> +	 */
>>>>>>> +retry:
>>>>>>>      	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>>> +		cond_resched();
>>>>>>> +		goto retry;
>>>>>>> +	}
>>>>>>
>>>>>> If this patch is about adding a non-blocking send function, and below we can
>>>>>> see that it creates a fork:
>>>>>>
>>>>>> intel_guc_ct_send:
>>>>>> ...
>>>>>> 	if (flags & INTEL_GUC_SEND_NB)
>>>>>> 		return ct_send_nb(ct, action, len, flags);
>>>>>>
>>>>>>     	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>>>>>
>>>>>> Then why is there a change in ct_send here, which is not the new
>>>>>> non-blocking path?
>>>>>>
>>>>>
>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
>>>>
>>>> I was doing by the diff which says:
>>>>
>>>>    static int ct_send(struct intel_guc_ct *ct,
>>>>    		   const u32 *action,
>>>>    		   u32 len,
>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>    		   u32 response_buf_size,
>>>>    		   u32 *status)
>>>>    {
>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>    	struct ct_request request;
>>>>    	unsigned long flags;
>>>>    	u32 fence;
>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>    	GEM_BUG_ON(!len);
>>>>    	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>    	GEM_BUG_ON(!response_buf && response_buf_size);
>>>> +	might_sleep();
>>>> +	/*
>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>> +	 * buffers are sized correctly the flow control condition should be
>>>> +	 * rare.
>>>> +	 */
>>>> +retry:
>>>>    	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>> +		cond_resched();
>>>> +		goto retry;
>>>> +	}
>>>>
>>>> So it looks like a change to ct_send to me. Is that wrong?
>>
>> What about this part - is the patch changing the blocking ct_send or not,
>> and if it is why?
>>
> 
> Yes, ct_send() changes. Sorry for the confusion.
> 
> This function needs to be updated to account for the H2G space and
> backoff if no space is available.

Since this one is the sleeping path, it probably can and needs to be 
smarter than having a cond_resched busy loop added. Like sleep and get 
woken up when there is space. Otherwise it can degenerate to busy 
looping via contention with the non-blocking path.

Regards,

Tvrtko
Matthew Brost June 7, 2021, 5:31 p.m. UTC | #10
On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> 
> On 27/05/2021 15:35, Matthew Brost wrote:
> > On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 26/05/2021 19:10, Matthew Brost wrote:
> > > 
> > > [snip]
> > > 
> > > > > > > > +static int ct_send_nb(struct intel_guc_ct *ct,
> > > > > > > > +		      const u32 *action,
> > > > > > > > +		      u32 len,
> > > > > > > > +		      u32 flags)
> > > > > > > > +{
> > > > > > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > > > > +	unsigned long spin_flags;
> > > > > > > > +	u32 fence;
> > > > > > > > +	int ret;
> > > > > > > > +
> > > > > > > > +	spin_lock_irqsave(&ctb->lock, spin_flags);
> > > > > > > > +
> > > > > > > > +	ret = ctb_has_room(ctb, len + 1);
> > > > > > > > +	if (unlikely(ret))
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	fence = ct_get_next_fence(ct);
> > > > > > > > +	ret = ct_write(ct, action, len, fence, flags);
> > > > > > > > +	if (unlikely(ret))
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	intel_guc_notify(ct_to_guc(ct));
> > > > > > > > +
> > > > > > > > +out:
> > > > > > > > +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > > > > > > > +
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >      static int ct_send(struct intel_guc_ct *ct,
> > > > > > > >      		   const u32 *action,
> > > > > > > >      		   u32 len,
> > > > > > > > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > > > >      		   u32 response_buf_size,
> > > > > > > >      		   u32 *status)
> > > > > > > >      {
> > > > > > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > > > >      	struct ct_request request;
> > > > > > > >      	unsigned long flags;
> > > > > > > >      	u32 fence;
> > > > > > > > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > > > >      	GEM_BUG_ON(!len);
> > > > > > > >      	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > > > > >      	GEM_BUG_ON(!response_buf && response_buf_size);
> > > > > > > > +	might_sleep();
> > > > > > > 
> > > > > > > Sleep is just cond_resched below or there is more?
> > > > > > > 
> > > > > > 
> > > > > > Yes, the cond_resched.
> > > > > > 
> > > > > > > > +	/*
> > > > > > > > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > > > > > > > +	 * buffers are sized correctly the flow control condition should be
> > > > > > > > +	 * rare.
> > > > > > > > +	 */
> > > > > > > > +retry:
> > > > > > > >      	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > > > > > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > > > > > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > > > > > +		cond_resched();
> > > > > > > > +		goto retry;
> > > > > > > > +	}
> > > > > > > 
> > > > > > > If this patch is about adding a non-blocking send function, and below we can
> > > > > > > see that it creates a fork:
> > > > > > > 
> > > > > > > intel_guc_ct_send:
> > > > > > > ...
> > > > > > > 	if (flags & INTEL_GUC_SEND_NB)
> > > > > > > 		return ct_send_nb(ct, action, len, flags);
> > > > > > > 
> > > > > > >     	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > > > > > 
> > > > > > > Then why is there a change in ct_send here, which is not the new
> > > > > > > non-blocking path?
> > > > > > > 
> > > > > > 
> > > > > > There is not a change to ct_send(), just to intel_guc_ct_send.
> > > > > 
> > > > > I was doing by the diff which says:
> > > > > 
> > > > >    static int ct_send(struct intel_guc_ct *ct,
> > > > >    		   const u32 *action,
> > > > >    		   u32 len,
> > > > > @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > >    		   u32 response_buf_size,
> > > > >    		   u32 *status)
> > > > >    {
> > > > > +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > >    	struct ct_request request;
> > > > >    	unsigned long flags;
> > > > >    	u32 fence;
> > > > > @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > >    	GEM_BUG_ON(!len);
> > > > >    	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > >    	GEM_BUG_ON(!response_buf && response_buf_size);
> > > > > +	might_sleep();
> > > > > +	/*
> > > > > +	 * We use a lazy spin wait loop here as we believe that if the CT
> > > > > +	 * buffers are sized correctly the flow control condition should be
> > > > > +	 * rare.
> > > > > +	 */
> > > > > +retry:
> > > > >    	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > > +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > > +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > > +		cond_resched();
> > > > > +		goto retry;
> > > > > +	}
> > > > > 
> > > > > So it looks like a change to ct_send to me. Is that wrong?
> > > 
> > > What about this part - is the patch changing the blocking ct_send or not,
> > > and if it is why?
> > > 
> > 
> > Yes, ct_send() changes. Sorry for the confusion.
> > 
> > This function needs to be updated to account for the H2G space and
> > backoff if no space is available.
> 
> Since this one is the sleeping path, it probably can and needs to be smarter
> than having a cond_resched busy loop added. Like sleep and get woken up when
> there is space. Otherwise it can degenerate to busy looping via contention
> with the non-blocking path.
> 

That screams over enginerring a simple problem to me. If the CT channel
is full we are really in trouble anyways - i.e. the performance is going
to terrible as we overwhelmed the GuC with traffic. That being said,
IGTs can do this but that really isn't a real world use case. For the
real world, this buffer is large enough that it won't ever be full hence
the comment + lazy spin loop.

Next, it isn't like we get an interrupt or something when space
becomes available so how would we wake this thread? Could we come up
with a convoluted scheme where we insert ops that generated an interrupt
at regular intervals, probably? Would it be super complicated, totally
unnecessary, and gain use nothing - absolutely.

Lastly, blocking CTBs really shouldn't ever be used. Certainly the
submission code doesn't use these. I think SRIOV might, but those can
probably be reworked too to use non-blocking. At some point we might
want to scrub the driver and just delete the blocking path.

Matt

> Regards,

> 
> Tvrtko
Tvrtko Ursulin June 8, 2021, 8:39 a.m. UTC | #11
On 07/06/2021 18:31, Matthew Brost wrote:
> On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
>>
>> On 27/05/2021 15:35, Matthew Brost wrote:
>>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 26/05/2021 19:10, Matthew Brost wrote:
>>>>
>>>> [snip]
>>>>
>>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
>>>>>>>>> +		      const u32 *action,
>>>>>>>>> +		      u32 len,
>>>>>>>>> +		      u32 flags)
>>>>>>>>> +{
>>>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>>> +	unsigned long spin_flags;
>>>>>>>>> +	u32 fence;
>>>>>>>>> +	int ret;
>>>>>>>>> +
>>>>>>>>> +	spin_lock_irqsave(&ctb->lock, spin_flags);
>>>>>>>>> +
>>>>>>>>> +	ret = ctb_has_room(ctb, len + 1);
>>>>>>>>> +	if (unlikely(ret))
>>>>>>>>> +		goto out;
>>>>>>>>> +
>>>>>>>>> +	fence = ct_get_next_fence(ct);
>>>>>>>>> +	ret = ct_write(ct, action, len, fence, flags);
>>>>>>>>> +	if (unlikely(ret))
>>>>>>>>> +		goto out;
>>>>>>>>> +
>>>>>>>>> +	intel_guc_notify(ct_to_guc(ct));
>>>>>>>>> +
>>>>>>>>> +out:
>>>>>>>>> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
>>>>>>>>> +
>>>>>>>>> +	return ret;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>       		   const u32 *action,
>>>>>>>>>       		   u32 len,
>>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>       		   u32 response_buf_size,
>>>>>>>>>       		   u32 *status)
>>>>>>>>>       {
>>>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>>>       	struct ct_request request;
>>>>>>>>>       	unsigned long flags;
>>>>>>>>>       	u32 fence;
>>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>       	GEM_BUG_ON(!len);
>>>>>>>>>       	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>>>>       	GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>>>>> +	might_sleep();
>>>>>>>>
>>>>>>>> Sleep is just cond_resched below or there is more?
>>>>>>>>
>>>>>>>
>>>>>>> Yes, the cond_resched.
>>>>>>>
>>>>>>>>> +	/*
>>>>>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>>>>>>> +	 * buffers are sized correctly the flow control condition should be
>>>>>>>>> +	 * rare.
>>>>>>>>> +	 */
>>>>>>>>> +retry:
>>>>>>>>>       	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>>>>> +		cond_resched();
>>>>>>>>> +		goto retry;
>>>>>>>>> +	}
>>>>>>>>
>>>>>>>> If this patch is about adding a non-blocking send function, and below we can
>>>>>>>> see that it creates a fork:
>>>>>>>>
>>>>>>>> intel_guc_ct_send:
>>>>>>>> ...
>>>>>>>> 	if (flags & INTEL_GUC_SEND_NB)
>>>>>>>> 		return ct_send_nb(ct, action, len, flags);
>>>>>>>>
>>>>>>>>      	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>>>>>>>
>>>>>>>> Then why is there a change in ct_send here, which is not the new
>>>>>>>> non-blocking path?
>>>>>>>>
>>>>>>>
>>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
>>>>>>
>>>>>> I was doing by the diff which says:
>>>>>>
>>>>>>     static int ct_send(struct intel_guc_ct *ct,
>>>>>>     		   const u32 *action,
>>>>>>     		   u32 len,
>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>     		   u32 response_buf_size,
>>>>>>     		   u32 *status)
>>>>>>     {
>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>     	struct ct_request request;
>>>>>>     	unsigned long flags;
>>>>>>     	u32 fence;
>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>     	GEM_BUG_ON(!len);
>>>>>>     	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>     	GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>> +	might_sleep();
>>>>>> +	/*
>>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>>>> +	 * buffers are sized correctly the flow control condition should be
>>>>>> +	 * rare.
>>>>>> +	 */
>>>>>> +retry:
>>>>>>     	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>> +		cond_resched();
>>>>>> +		goto retry;
>>>>>> +	}
>>>>>>
>>>>>> So it looks like a change to ct_send to me. Is that wrong?
>>>>
>>>> What about this part - is the patch changing the blocking ct_send or not,
>>>> and if it is why?
>>>>
>>>
>>> Yes, ct_send() changes. Sorry for the confusion.
>>>
>>> This function needs to be updated to account for the H2G space and
>>> backoff if no space is available.
>>
>> Since this one is the sleeping path, it probably can and needs to be smarter
>> than having a cond_resched busy loop added. Like sleep and get woken up when
>> there is space. Otherwise it can degenerate to busy looping via contention
>> with the non-blocking path.
>>
> 
> That screams over enginerring a simple problem to me. If the CT channel
> is full we are really in trouble anyways - i.e. the performance is going
> to terrible as we overwhelmed the GuC with traffic. That being said,

Performance of what would be terrible? Something relating to submitting 
new jobs to the GPU I guess. Or something SRIOV related as you hint below.

But there is no real reason why CPU cycles/power should suffer if GuC is 
busy.

Okay, if it can't happen in real world then it's possibly passable as a 
design of a communication interface. But to me it leaves a bad taste and 
a doubt that there is this other aspect of the real world. And that is 
when the unexpected happens. Even the most trivial things like a bug in 
GuC firmware causes the driver to busy spin in there. So not much 
happening on the machine but CPU cores pinned burning cycles in this 
code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage 
and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the 
machine". Oh well..

At least I think the commit message should spell out clearly that a busy 
looping path is being added to the sleeping send as a downside of 
implementation choices. Still, for the record, I object to the design.

Regards,

Tvrtko

> IGTs can do this but that really isn't a real world use case. For the
> real world, this buffer is large enough that it won't ever be full hence
> the comment + lazy spin loop.
> 
> Next, it isn't like we get an interrupt or something when space
> becomes available so how would we wake this thread? Could we come up
> with a convoluted scheme where we insert ops that generated an interrupt
> at regular intervals, probably? Would it be super complicated, totally
> unnecessary, and gain use nothing - absolutely.
> 
> Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> submission code doesn't use these. I think SRIOV might, but those can
> probably be reworked too to use non-blocking. At some point we might
> want to scrub the driver and just delete the blocking path.
> 
> Matt
> 
>> Regards,
> 
>>
>> Tvrtko
Daniel Vetter June 8, 2021, 8:46 a.m. UTC | #12
On Tue, Jun 8, 2021 at 10:39 AM Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
>
>
> On 07/06/2021 18:31, Matthew Brost wrote:
> > On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> >>
> >> On 27/05/2021 15:35, Matthew Brost wrote:
> >>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> >>>>
> >>>> On 26/05/2021 19:10, Matthew Brost wrote:
> >>>>
> >>>> [snip]
> >>>>
> >>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> >>>>>>>>> +                   const u32 *action,
> >>>>>>>>> +                   u32 len,
> >>>>>>>>> +                   u32 flags)
> >>>>>>>>> +{
> >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>>> +     unsigned long spin_flags;
> >>>>>>>>> +     u32 fence;
> >>>>>>>>> +     int ret;
> >>>>>>>>> +
> >>>>>>>>> +     spin_lock_irqsave(&ctb->lock, spin_flags);
> >>>>>>>>> +
> >>>>>>>>> +     ret = ctb_has_room(ctb, len + 1);
> >>>>>>>>> +     if (unlikely(ret))
> >>>>>>>>> +             goto out;
> >>>>>>>>> +
> >>>>>>>>> +     fence = ct_get_next_fence(ct);
> >>>>>>>>> +     ret = ct_write(ct, action, len, fence, flags);
> >>>>>>>>> +     if (unlikely(ret))
> >>>>>>>>> +             goto out;
> >>>>>>>>> +
> >>>>>>>>> +     intel_guc_notify(ct_to_guc(ct));
> >>>>>>>>> +
> >>>>>>>>> +out:
> >>>>>>>>> +     spin_unlock_irqrestore(&ctb->lock, spin_flags);
> >>>>>>>>> +
> >>>>>>>>> +     return ret;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>                          const u32 *action,
> >>>>>>>>>                          u32 len,
> >>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>                          u32 response_buf_size,
> >>>>>>>>>                          u32 *status)
> >>>>>>>>>       {
> >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>>>               struct ct_request request;
> >>>>>>>>>               unsigned long flags;
> >>>>>>>>>               u32 fence;
> >>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>               GEM_BUG_ON(!len);
> >>>>>>>>>               GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >>>>>>>>>               GEM_BUG_ON(!response_buf && response_buf_size);
> >>>>>>>>> +     might_sleep();
> >>>>>>>>
> >>>>>>>> Sleep is just cond_resched below or there is more?
> >>>>>>>>
> >>>>>>>
> >>>>>>> Yes, the cond_resched.
> >>>>>>>
> >>>>>>>>> +     /*
> >>>>>>>>> +      * We use a lazy spin wait loop here as we believe that if the CT
> >>>>>>>>> +      * buffers are sized correctly the flow control condition should be
> >>>>>>>>> +      * rare.
> >>>>>>>>> +      */
> >>>>>>>>> +retry:
> >>>>>>>>>               spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> >>>>>>>>> +     if (unlikely(!ctb_has_room(ctb, len + 1))) {
> >>>>>>>>> +             spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >>>>>>>>> +             cond_resched();
> >>>>>>>>> +             goto retry;
> >>>>>>>>> +     }
> >>>>>>>>
> >>>>>>>> If this patch is about adding a non-blocking send function, and below we can
> >>>>>>>> see that it creates a fork:
> >>>>>>>>
> >>>>>>>> intel_guc_ct_send:
> >>>>>>>> ...
> >>>>>>>>        if (flags & INTEL_GUC_SEND_NB)
> >>>>>>>>                return ct_send_nb(ct, action, len, flags);
> >>>>>>>>
> >>>>>>>>        ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> >>>>>>>>
> >>>>>>>> Then why is there a change in ct_send here, which is not the new
> >>>>>>>> non-blocking path?
> >>>>>>>>
> >>>>>>>
> >>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> >>>>>>
> >>>>>> I was doing by the diff which says:
> >>>>>>
> >>>>>>     static int ct_send(struct intel_guc_ct *ct,
> >>>>>>                     const u32 *action,
> >>>>>>                     u32 len,
> >>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>                     u32 response_buf_size,
> >>>>>>                     u32 *status)
> >>>>>>     {
> >>>>>> +        struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>          struct ct_request request;
> >>>>>>          unsigned long flags;
> >>>>>>          u32 fence;
> >>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>          GEM_BUG_ON(!len);
> >>>>>>          GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >>>>>>          GEM_BUG_ON(!response_buf && response_buf_size);
> >>>>>> +        might_sleep();
> >>>>>> +        /*
> >>>>>> +         * We use a lazy spin wait loop here as we believe that if the CT
> >>>>>> +         * buffers are sized correctly the flow control condition should be
> >>>>>> +         * rare.
> >>>>>> +         */
> >>>>>> +retry:
> >>>>>>          spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> >>>>>> +        if (unlikely(!ctb_has_room(ctb, len + 1))) {
> >>>>>> +                spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >>>>>> +                cond_resched();
> >>>>>> +                goto retry;
> >>>>>> +        }
> >>>>>>
> >>>>>> So it looks like a change to ct_send to me. Is that wrong?
> >>>>
> >>>> What about this part - is the patch changing the blocking ct_send or not,
> >>>> and if it is why?
> >>>>
> >>>
> >>> Yes, ct_send() changes. Sorry for the confusion.
> >>>
> >>> This function needs to be updated to account for the H2G space and
> >>> backoff if no space is available.
> >>
> >> Since this one is the sleeping path, it probably can and needs to be smarter
> >> than having a cond_resched busy loop added. Like sleep and get woken up when
> >> there is space. Otherwise it can degenerate to busy looping via contention
> >> with the non-blocking path.
> >>
> >
> > That screams over enginerring a simple problem to me. If the CT channel
> > is full we are really in trouble anyways - i.e. the performance is going
> > to terrible as we overwhelmed the GuC with traffic. That being said,
>
> Performance of what would be terrible? Something relating to submitting
> new jobs to the GPU I guess. Or something SRIOV related as you hint below.
>
> But there is no real reason why CPU cycles/power should suffer if GuC is
> busy.
>
> Okay, if it can't happen in real world then it's possibly passable as a
> design of a communication interface. But to me it leaves a bad taste and
> a doubt that there is this other aspect of the real world. And that is
> when the unexpected happens. Even the most trivial things like a bug in
> GuC firmware causes the driver to busy spin in there. So not much
> happening on the machine but CPU cores pinned burning cycles in this
> code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> machine". Oh well..
>
> At least I think the commit message should spell out clearly that a busy
> looping path is being added to the sleeping send as a downside of
> implementation choices. Still, for the record, I object to the design.
>
> > IGTs can do this but that really isn't a real world use case. For the
> > real world, this buffer is large enough that it won't ever be full hence
> > the comment + lazy spin loop.
> >
> > Next, it isn't like we get an interrupt or something when space
> > becomes available so how would we wake this thread? Could we come up
> > with a convoluted scheme where we insert ops that generated an interrupt
> > at regular intervals, probably? Would it be super complicated, totally
> > unnecessary, and gain use nothing - absolutely.
> >
> > Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> > submission code doesn't use these. I think SRIOV might, but those can
> > probably be reworked too to use non-blocking. At some point we might
> > want to scrub the driver and just delete the blocking path.

I'd do an s/cond_resched()/msleep(1)/ and comment explaining why we
just don't care about this. That checks of the cpu wasting in this
case (GuC is overloaded, it wont come back anytime soon anyway) and
explains why we really don't want to make this any more clever or
complex code (because comment can explain why we wont hit this in
actual real world usage except when something else is on fire already
anyway).

If you want to go absolutely overkill and it's not too much work, make
the msleep interruptible or check for signals, and bail out. That way
the process can be made unstuck with ^C at least.
-Daniel
Michal Wajdeczko June 9, 2021, 1:58 p.m. UTC | #13
On 08.06.2021 10:39, Tvrtko Ursulin wrote:
> 
> On 07/06/2021 18:31, Matthew Brost wrote:
>> On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
>>>
>>> On 27/05/2021 15:35, Matthew Brost wrote:
>>>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
>>>>>
>>>>> On 26/05/2021 19:10, Matthew Brost wrote:
>>>>>
>>>>> [snip]
>>>>>
>>>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
>>>>>>>>>> +              const u32 *action,
>>>>>>>>>> +              u32 len,
>>>>>>>>>> +              u32 flags)
>>>>>>>>>> +{
>>>>>>>>>> +    struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>>>> +    unsigned long spin_flags;
>>>>>>>>>> +    u32 fence;
>>>>>>>>>> +    int ret;
>>>>>>>>>> +
>>>>>>>>>> +    spin_lock_irqsave(&ctb->lock, spin_flags);
>>>>>>>>>> +
>>>>>>>>>> +    ret = ctb_has_room(ctb, len + 1);
>>>>>>>>>> +    if (unlikely(ret))
>>>>>>>>>> +        goto out;
>>>>>>>>>> +
>>>>>>>>>> +    fence = ct_get_next_fence(ct);
>>>>>>>>>> +    ret = ct_write(ct, action, len, fence, flags);
>>>>>>>>>> +    if (unlikely(ret))
>>>>>>>>>> +        goto out;
>>>>>>>>>> +
>>>>>>>>>> +    intel_guc_notify(ct_to_guc(ct));
>>>>>>>>>> +
>>>>>>>>>> +out:
>>>>>>>>>> +    spin_unlock_irqrestore(&ctb->lock, spin_flags);
>>>>>>>>>> +
>>>>>>>>>> +    return ret;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>>                  const u32 *action,
>>>>>>>>>>                  u32 len,
>>>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>>                  u32 response_buf_size,
>>>>>>>>>>                  u32 *status)
>>>>>>>>>>       {
>>>>>>>>>> +    struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>>>>           struct ct_request request;
>>>>>>>>>>           unsigned long flags;
>>>>>>>>>>           u32 fence;
>>>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>>           GEM_BUG_ON(!len);
>>>>>>>>>>           GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>>>>>           GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>>>>>> +    might_sleep();
>>>>>>>>>
>>>>>>>>> Sleep is just cond_resched below or there is more?
>>>>>>>>>
>>>>>>>>
>>>>>>>> Yes, the cond_resched.
>>>>>>>>
>>>>>>>>>> +    /*
>>>>>>>>>> +     * We use a lazy spin wait loop here as we believe that
>>>>>>>>>> if the CT
>>>>>>>>>> +     * buffers are sized correctly the flow control condition
>>>>>>>>>> should be
>>>>>>>>>> +     * rare.
>>>>>>>>>> +     */
>>>>>>>>>> +retry:
>>>>>>>>>>           spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>>>>>> +    if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>>>>>> +        spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>>>>>> +        cond_resched();
>>>>>>>>>> +        goto retry;
>>>>>>>>>> +    }
>>>>>>>>>
>>>>>>>>> If this patch is about adding a non-blocking send function, and
>>>>>>>>> below we can
>>>>>>>>> see that it creates a fork:
>>>>>>>>>
>>>>>>>>> intel_guc_ct_send:
>>>>>>>>> ...
>>>>>>>>>     if (flags & INTEL_GUC_SEND_NB)
>>>>>>>>>         return ct_send_nb(ct, action, len, flags);
>>>>>>>>>
>>>>>>>>>          ret = ct_send(ct, action, len, response_buf,
>>>>>>>>> response_buf_size, &status);
>>>>>>>>>
>>>>>>>>> Then why is there a change in ct_send here, which is not the new
>>>>>>>>> non-blocking path?
>>>>>>>>>
>>>>>>>>
>>>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
>>>>>>>
>>>>>>> I was doing by the diff which says:
>>>>>>>
>>>>>>>     static int ct_send(struct intel_guc_ct *ct,
>>>>>>>                const u32 *action,
>>>>>>>                u32 len,
>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>                u32 response_buf_size,
>>>>>>>                u32 *status)
>>>>>>>     {
>>>>>>> +    struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>         struct ct_request request;
>>>>>>>         unsigned long flags;
>>>>>>>         u32 fence;
>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>         GEM_BUG_ON(!len);
>>>>>>>         GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>>         GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>>> +    might_sleep();
>>>>>>> +    /*
>>>>>>> +     * We use a lazy spin wait loop here as we believe that if
>>>>>>> the CT
>>>>>>> +     * buffers are sized correctly the flow control condition
>>>>>>> should be
>>>>>>> +     * rare.
>>>>>>> +     */
>>>>>>> +retry:
>>>>>>>         spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>>> +    if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>>> +        spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>>> +        cond_resched();
>>>>>>> +        goto retry;
>>>>>>> +    }
>>>>>>>
>>>>>>> So it looks like a change to ct_send to me. Is that wrong?
>>>>>
>>>>> What about this part - is the patch changing the blocking ct_send
>>>>> or not,
>>>>> and if it is why?
>>>>>
>>>>
>>>> Yes, ct_send() changes. Sorry for the confusion.
>>>>
>>>> This function needs to be updated to account for the H2G space and
>>>> backoff if no space is available.
>>>
>>> Since this one is the sleeping path, it probably can and needs to be
>>> smarter
>>> than having a cond_resched busy loop added. Like sleep and get woken
>>> up when
>>> there is space. Otherwise it can degenerate to busy looping via
>>> contention
>>> with the non-blocking path.
>>>
>>
>> That screams over enginerring a simple problem to me. If the CT channel
>> is full we are really in trouble anyways - i.e. the performance is going
>> to terrible as we overwhelmed the GuC with traffic. That being said,
> 
> Performance of what would be terrible? Something relating to submitting
> new jobs to the GPU I guess. Or something SRIOV related as you hint below.
> 
> But there is no real reason why CPU cycles/power should suffer if GuC is
> busy.
> 
> Okay, if it can't happen in real world then it's possibly passable as a

if that can't happen in real world, then maybe we can just return
-ENOSPC/-EBUSY to report that 'unexpected' case, instead of hiding it
behind silent busy loop ?

> design of a communication interface. But to me it leaves a bad taste and
> a doubt that there is this other aspect of the real world. And that is
> when the unexpected happens. Even the most trivial things like a bug in
> GuC firmware causes the driver to busy spin in there. So not much
> happening on the machine but CPU cores pinned burning cycles in this
> code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> machine". Oh well..
> 
> At least I think the commit message should spell out clearly that a busy
> looping path is being added to the sleeping send as a downside of
> implementation choices. Still, for the record, I object to the design.
> 
> Regards,
> 
> Tvrtko
> 
>> IGTs can do this but that really isn't a real world use case. For the
>> real world, this buffer is large enough that it won't ever be full hence
>> the comment + lazy spin loop.
>>
>> Next, it isn't like we get an interrupt or something when space
>> becomes available so how would we wake this thread? Could we come up
>> with a convoluted scheme where we insert ops that generated an interrupt
>> at regular intervals, probably? Would it be super complicated, totally
>> unnecessary, and gain use nothing - absolutely.
>>
>> Lastly, blocking CTBs really shouldn't ever be used. Certainly the
>> submission code doesn't use these. I think SRIOV might, but those can
>> probably be reworked too to use non-blocking. At some point we might
>> want to scrub the driver and just delete the blocking path.
>>
>> Matt
>>
>>> Regards,
>>
>>>
>>> Tvrtko
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Michal Wajdeczko June 9, 2021, 2:14 p.m. UTC | #14
On 07.06.2021 19:31, Matthew Brost wrote:
> On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
>>
>> On 27/05/2021 15:35, Matthew Brost wrote:
>>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 26/05/2021 19:10, Matthew Brost wrote:
>>>>
>>>> [snip]
>>>>
>>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
>>>>>>>>> +		      const u32 *action,
>>>>>>>>> +		      u32 len,
>>>>>>>>> +		      u32 flags)
>>>>>>>>> +{
>>>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>>> +	unsigned long spin_flags;
>>>>>>>>> +	u32 fence;
>>>>>>>>> +	int ret;
>>>>>>>>> +
>>>>>>>>> +	spin_lock_irqsave(&ctb->lock, spin_flags);
>>>>>>>>> +
>>>>>>>>> +	ret = ctb_has_room(ctb, len + 1);
>>>>>>>>> +	if (unlikely(ret))
>>>>>>>>> +		goto out;
>>>>>>>>> +
>>>>>>>>> +	fence = ct_get_next_fence(ct);
>>>>>>>>> +	ret = ct_write(ct, action, len, fence, flags);
>>>>>>>>> +	if (unlikely(ret))
>>>>>>>>> +		goto out;
>>>>>>>>> +
>>>>>>>>> +	intel_guc_notify(ct_to_guc(ct));
>>>>>>>>> +
>>>>>>>>> +out:
>>>>>>>>> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
>>>>>>>>> +
>>>>>>>>> +	return ret;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>      static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>      		   const u32 *action,
>>>>>>>>>      		   u32 len,
>>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>      		   u32 response_buf_size,
>>>>>>>>>      		   u32 *status)
>>>>>>>>>      {
>>>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>>>>      	struct ct_request request;
>>>>>>>>>      	unsigned long flags;
>>>>>>>>>      	u32 fence;
>>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>>>>      	GEM_BUG_ON(!len);
>>>>>>>>>      	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>>>>      	GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>>>>> +	might_sleep();
>>>>>>>>
>>>>>>>> Sleep is just cond_resched below or there is more?
>>>>>>>>
>>>>>>>
>>>>>>> Yes, the cond_resched.
>>>>>>>
>>>>>>>>> +	/*
>>>>>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>>>>>>> +	 * buffers are sized correctly the flow control condition should be
>>>>>>>>> +	 * rare.
>>>>>>>>> +	 */
>>>>>>>>> +retry:
>>>>>>>>>      	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>>>>> +		cond_resched();
>>>>>>>>> +		goto retry;
>>>>>>>>> +	}
>>>>>>>>
>>>>>>>> If this patch is about adding a non-blocking send function, and below we can
>>>>>>>> see that it creates a fork:
>>>>>>>>
>>>>>>>> intel_guc_ct_send:
>>>>>>>> ...
>>>>>>>> 	if (flags & INTEL_GUC_SEND_NB)
>>>>>>>> 		return ct_send_nb(ct, action, len, flags);
>>>>>>>>
>>>>>>>>     	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
>>>>>>>>
>>>>>>>> Then why is there a change in ct_send here, which is not the new
>>>>>>>> non-blocking path?
>>>>>>>>
>>>>>>>
>>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
>>>>>>
>>>>>> I was doing by the diff which says:
>>>>>>
>>>>>>    static int ct_send(struct intel_guc_ct *ct,
>>>>>>    		   const u32 *action,
>>>>>>    		   u32 len,
>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>    		   u32 response_buf_size,
>>>>>>    		   u32 *status)
>>>>>>    {
>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>>>>>    	struct ct_request request;
>>>>>>    	unsigned long flags;
>>>>>>    	u32 fence;
>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
>>>>>>    	GEM_BUG_ON(!len);
>>>>>>    	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
>>>>>>    	GEM_BUG_ON(!response_buf && response_buf_size);
>>>>>> +	might_sleep();
>>>>>> +	/*
>>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
>>>>>> +	 * buffers are sized correctly the flow control condition should be
>>>>>> +	 * rare.
>>>>>> +	 */
>>>>>> +retry:
>>>>>>    	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>>>>> +		cond_resched();
>>>>>> +		goto retry;
>>>>>> +	}
>>>>>>
>>>>>> So it looks like a change to ct_send to me. Is that wrong?
>>>>
>>>> What about this part - is the patch changing the blocking ct_send or not,
>>>> and if it is why?
>>>>
>>>
>>> Yes, ct_send() changes. Sorry for the confusion.
>>>
>>> This function needs to be updated to account for the H2G space and
>>> backoff if no space is available.
>>
>> Since this one is the sleeping path, it probably can and needs to be smarter
>> than having a cond_resched busy loop added. Like sleep and get woken up when
>> there is space. Otherwise it can degenerate to busy looping via contention
>> with the non-blocking path.
>>
> 
> That screams over enginerring a simple problem to me. If the CT channel
> is full we are really in trouble anyways - i.e. the performance is going
> to terrible as we overwhelmed the GuC with traffic. That being said,
> IGTs can do this but that really isn't a real world use case. For the
> real world, this buffer is large enough that it won't ever be full hence
> the comment + lazy spin loop.
> 
> Next, it isn't like we get an interrupt or something when space
> becomes available so how would we wake this thread? Could we come up
> with a convoluted scheme where we insert ops that generated an interrupt
> at regular intervals, probably? Would it be super complicated, totally
> unnecessary, and gain use nothing - absolutely.
> 
> Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> submission code doesn't use these. I think SRIOV might, but those can
> probably be reworked too to use non-blocking. At some point we might
> want to scrub the driver and just delete the blocking path.

I guess the main problem is not with "blocking CTBs", as now only
calling thread is "blocked" waiting for reply and other threads can
still send their CTBs (blocked/nonblocking), but the fact that we are
sending too many messages, stopping only when CTB is full, and even then
trying hard to squeeze that message again.

it should be caller responsibility to throttle its stream of
non-blocking CTBs if either we are running out of CTB but if we have too
many "non-blocking" requests in flight.

making CTB buffer just larger and larger does not solve the problem,
only makes it less visible

and as you are using busy-loop to send even 'non-blocking' CTBs, it
might indicate that your code is not prepared to step-back in case of
any temporary CTB congestion

also note that currently all CTB messages are asynchronous, REQUEST /
RESPONSE pair could be processed in fully non-blocking approach, but
that would require refactoring of part driver into event-driven state
machine, as sometimes we can't move forward without information that we
are waiting from the GuC (and blocking was simplest solution for that)

but if your submission code is already  event-driven, then it should be
easier to trigger state machine into 'retry' mode without using this
busy-loop

> 
> Matt
> 
>> Regards,
> 
>>
>> Tvrtko
Matthew Brost June 9, 2021, 11:05 p.m. UTC | #15
On Wed, Jun 09, 2021 at 03:58:38PM +0200, Michal Wajdeczko wrote:
> 
> 
> On 08.06.2021 10:39, Tvrtko Ursulin wrote:
> > 
> > On 07/06/2021 18:31, Matthew Brost wrote:
> >> On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> >>>
> >>> On 27/05/2021 15:35, Matthew Brost wrote:
> >>>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> >>>>>
> >>>>> On 26/05/2021 19:10, Matthew Brost wrote:
> >>>>>
> >>>>> [snip]
> >>>>>
> >>>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> >>>>>>>>>> +              const u32 *action,
> >>>>>>>>>> +              u32 len,
> >>>>>>>>>> +              u32 flags)
> >>>>>>>>>> +{
> >>>>>>>>>> +    struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>>>> +    unsigned long spin_flags;
> >>>>>>>>>> +    u32 fence;
> >>>>>>>>>> +    int ret;
> >>>>>>>>>> +
> >>>>>>>>>> +    spin_lock_irqsave(&ctb->lock, spin_flags);
> >>>>>>>>>> +
> >>>>>>>>>> +    ret = ctb_has_room(ctb, len + 1);
> >>>>>>>>>> +    if (unlikely(ret))
> >>>>>>>>>> +        goto out;
> >>>>>>>>>> +
> >>>>>>>>>> +    fence = ct_get_next_fence(ct);
> >>>>>>>>>> +    ret = ct_write(ct, action, len, fence, flags);
> >>>>>>>>>> +    if (unlikely(ret))
> >>>>>>>>>> +        goto out;
> >>>>>>>>>> +
> >>>>>>>>>> +    intel_guc_notify(ct_to_guc(ct));
> >>>>>>>>>> +
> >>>>>>>>>> +out:
> >>>>>>>>>> +    spin_unlock_irqrestore(&ctb->lock, spin_flags);
> >>>>>>>>>> +
> >>>>>>>>>> +    return ret;
> >>>>>>>>>> +}
> >>>>>>>>>> +
> >>>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>>                  const u32 *action,
> >>>>>>>>>>                  u32 len,
> >>>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>>                  u32 response_buf_size,
> >>>>>>>>>>                  u32 *status)
> >>>>>>>>>>       {
> >>>>>>>>>> +    struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>>>>           struct ct_request request;
> >>>>>>>>>>           unsigned long flags;
> >>>>>>>>>>           u32 fence;
> >>>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>>           GEM_BUG_ON(!len);
> >>>>>>>>>>           GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >>>>>>>>>>           GEM_BUG_ON(!response_buf && response_buf_size);
> >>>>>>>>>> +    might_sleep();
> >>>>>>>>>
> >>>>>>>>> Sleep is just cond_resched below or there is more?
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> Yes, the cond_resched.
> >>>>>>>>
> >>>>>>>>>> +    /*
> >>>>>>>>>> +     * We use a lazy spin wait loop here as we believe that
> >>>>>>>>>> if the CT
> >>>>>>>>>> +     * buffers are sized correctly the flow control condition
> >>>>>>>>>> should be
> >>>>>>>>>> +     * rare.
> >>>>>>>>>> +     */
> >>>>>>>>>> +retry:
> >>>>>>>>>>           spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> >>>>>>>>>> +    if (unlikely(!ctb_has_room(ctb, len + 1))) {
> >>>>>>>>>> +        spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >>>>>>>>>> +        cond_resched();
> >>>>>>>>>> +        goto retry;
> >>>>>>>>>> +    }
> >>>>>>>>>
> >>>>>>>>> If this patch is about adding a non-blocking send function, and
> >>>>>>>>> below we can
> >>>>>>>>> see that it creates a fork:
> >>>>>>>>>
> >>>>>>>>> intel_guc_ct_send:
> >>>>>>>>> ...
> >>>>>>>>>     if (flags & INTEL_GUC_SEND_NB)
> >>>>>>>>>         return ct_send_nb(ct, action, len, flags);
> >>>>>>>>>
> >>>>>>>>>          ret = ct_send(ct, action, len, response_buf,
> >>>>>>>>> response_buf_size, &status);
> >>>>>>>>>
> >>>>>>>>> Then why is there a change in ct_send here, which is not the new
> >>>>>>>>> non-blocking path?
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> >>>>>>>
> >>>>>>> I was doing by the diff which says:
> >>>>>>>
> >>>>>>>     static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>                const u32 *action,
> >>>>>>>                u32 len,
> >>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>                u32 response_buf_size,
> >>>>>>>                u32 *status)
> >>>>>>>     {
> >>>>>>> +    struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>         struct ct_request request;
> >>>>>>>         unsigned long flags;
> >>>>>>>         u32 fence;
> >>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>         GEM_BUG_ON(!len);
> >>>>>>>         GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >>>>>>>         GEM_BUG_ON(!response_buf && response_buf_size);
> >>>>>>> +    might_sleep();
> >>>>>>> +    /*
> >>>>>>> +     * We use a lazy spin wait loop here as we believe that if
> >>>>>>> the CT
> >>>>>>> +     * buffers are sized correctly the flow control condition
> >>>>>>> should be
> >>>>>>> +     * rare.
> >>>>>>> +     */
> >>>>>>> +retry:
> >>>>>>>         spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> >>>>>>> +    if (unlikely(!ctb_has_room(ctb, len + 1))) {
> >>>>>>> +        spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >>>>>>> +        cond_resched();
> >>>>>>> +        goto retry;
> >>>>>>> +    }
> >>>>>>>
> >>>>>>> So it looks like a change to ct_send to me. Is that wrong?
> >>>>>
> >>>>> What about this part - is the patch changing the blocking ct_send
> >>>>> or not,
> >>>>> and if it is why?
> >>>>>
> >>>>
> >>>> Yes, ct_send() changes. Sorry for the confusion.
> >>>>
> >>>> This function needs to be updated to account for the H2G space and
> >>>> backoff if no space is available.
> >>>
> >>> Since this one is the sleeping path, it probably can and needs to be
> >>> smarter
> >>> than having a cond_resched busy loop added. Like sleep and get woken
> >>> up when
> >>> there is space. Otherwise it can degenerate to busy looping via
> >>> contention
> >>> with the non-blocking path.
> >>>
> >>
> >> That screams over enginerring a simple problem to me. If the CT channel
> >> is full we are really in trouble anyways - i.e. the performance is going
> >> to terrible as we overwhelmed the GuC with traffic. That being said,
> > 
> > Performance of what would be terrible? Something relating to submitting
> > new jobs to the GPU I guess. Or something SRIOV related as you hint below.
> > 
> > But there is no real reason why CPU cycles/power should suffer if GuC is
> > busy.
> > 
> > Okay, if it can't happen in real world then it's possibly passable as a
> 
> if that can't happen in real world, then maybe we can just return
> -ENOSPC/-EBUSY to report that 'unexpected' case, instead of hiding it
> behind silent busy loop ?
> 

No. This is a blocking call, hence it is ok for the function block if it
doesn't have space /w a timeout.

Matt

> > design of a communication interface. But to me it leaves a bad taste and
> > a doubt that there is this other aspect of the real world. And that is
> > when the unexpected happens. Even the most trivial things like a bug in
> > GuC firmware causes the driver to busy spin in there. So not much
> > happening on the machine but CPU cores pinned burning cycles in this
> > code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> > and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> > machine". Oh well..
> > 
> > At least I think the commit message should spell out clearly that a busy
> > looping path is being added to the sleeping send as a downside of
> > implementation choices. Still, for the record, I object to the design.
> > 
> > Regards,
> > 
> > Tvrtko
> > 
> >> IGTs can do this but that really isn't a real world use case. For the
> >> real world, this buffer is large enough that it won't ever be full hence
> >> the comment + lazy spin loop.
> >>
> >> Next, it isn't like we get an interrupt or something when space
> >> becomes available so how would we wake this thread? Could we come up
> >> with a convoluted scheme where we insert ops that generated an interrupt
> >> at regular intervals, probably? Would it be super complicated, totally
> >> unnecessary, and gain use nothing - absolutely.
> >>
> >> Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> >> submission code doesn't use these. I think SRIOV might, but those can
> >> probably be reworked too to use non-blocking. At some point we might
> >> want to scrub the driver and just delete the blocking path.
> >>
> >> Matt
> >>
> >>> Regards,
> >>
> >>>
> >>> Tvrtko
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Matthew Brost June 9, 2021, 11:10 p.m. UTC | #16
On Tue, Jun 08, 2021 at 10:46:15AM +0200, Daniel Vetter wrote:
> On Tue, Jun 8, 2021 at 10:39 AM Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
> >
> >
> > On 07/06/2021 18:31, Matthew Brost wrote:
> > > On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> > >>
> > >> On 27/05/2021 15:35, Matthew Brost wrote:
> > >>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> > >>>>
> > >>>> On 26/05/2021 19:10, Matthew Brost wrote:
> > >>>>
> > >>>> [snip]
> > >>>>
> > >>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> > >>>>>>>>> +                   const u32 *action,
> > >>>>>>>>> +                   u32 len,
> > >>>>>>>>> +                   u32 flags)
> > >>>>>>>>> +{
> > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > >>>>>>>>> +     unsigned long spin_flags;
> > >>>>>>>>> +     u32 fence;
> > >>>>>>>>> +     int ret;
> > >>>>>>>>> +
> > >>>>>>>>> +     spin_lock_irqsave(&ctb->lock, spin_flags);
> > >>>>>>>>> +
> > >>>>>>>>> +     ret = ctb_has_room(ctb, len + 1);
> > >>>>>>>>> +     if (unlikely(ret))
> > >>>>>>>>> +             goto out;
> > >>>>>>>>> +
> > >>>>>>>>> +     fence = ct_get_next_fence(ct);
> > >>>>>>>>> +     ret = ct_write(ct, action, len, fence, flags);
> > >>>>>>>>> +     if (unlikely(ret))
> > >>>>>>>>> +             goto out;
> > >>>>>>>>> +
> > >>>>>>>>> +     intel_guc_notify(ct_to_guc(ct));
> > >>>>>>>>> +
> > >>>>>>>>> +out:
> > >>>>>>>>> +     spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > >>>>>>>>> +
> > >>>>>>>>> +     return ret;
> > >>>>>>>>> +}
> > >>>>>>>>> +
> > >>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
> > >>>>>>>>>                          const u32 *action,
> > >>>>>>>>>                          u32 len,
> > >>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > >>>>>>>>>                          u32 response_buf_size,
> > >>>>>>>>>                          u32 *status)
> > >>>>>>>>>       {
> > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > >>>>>>>>>               struct ct_request request;
> > >>>>>>>>>               unsigned long flags;
> > >>>>>>>>>               u32 fence;
> > >>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > >>>>>>>>>               GEM_BUG_ON(!len);
> > >>>>>>>>>               GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > >>>>>>>>>               GEM_BUG_ON(!response_buf && response_buf_size);
> > >>>>>>>>> +     might_sleep();
> > >>>>>>>>
> > >>>>>>>> Sleep is just cond_resched below or there is more?
> > >>>>>>>>
> > >>>>>>>
> > >>>>>>> Yes, the cond_resched.
> > >>>>>>>
> > >>>>>>>>> +     /*
> > >>>>>>>>> +      * We use a lazy spin wait loop here as we believe that if the CT
> > >>>>>>>>> +      * buffers are sized correctly the flow control condition should be
> > >>>>>>>>> +      * rare.
> > >>>>>>>>> +      */
> > >>>>>>>>> +retry:
> > >>>>>>>>>               spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > >>>>>>>>> +     if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > >>>>>>>>> +             spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > >>>>>>>>> +             cond_resched();
> > >>>>>>>>> +             goto retry;
> > >>>>>>>>> +     }
> > >>>>>>>>
> > >>>>>>>> If this patch is about adding a non-blocking send function, and below we can
> > >>>>>>>> see that it creates a fork:
> > >>>>>>>>
> > >>>>>>>> intel_guc_ct_send:
> > >>>>>>>> ...
> > >>>>>>>>        if (flags & INTEL_GUC_SEND_NB)
> > >>>>>>>>                return ct_send_nb(ct, action, len, flags);
> > >>>>>>>>
> > >>>>>>>>        ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > >>>>>>>>
> > >>>>>>>> Then why is there a change in ct_send here, which is not the new
> > >>>>>>>> non-blocking path?
> > >>>>>>>>
> > >>>>>>>
> > >>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> > >>>>>>
> > >>>>>> I was doing by the diff which says:
> > >>>>>>
> > >>>>>>     static int ct_send(struct intel_guc_ct *ct,
> > >>>>>>                     const u32 *action,
> > >>>>>>                     u32 len,
> > >>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > >>>>>>                     u32 response_buf_size,
> > >>>>>>                     u32 *status)
> > >>>>>>     {
> > >>>>>> +        struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > >>>>>>          struct ct_request request;
> > >>>>>>          unsigned long flags;
> > >>>>>>          u32 fence;
> > >>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > >>>>>>          GEM_BUG_ON(!len);
> > >>>>>>          GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > >>>>>>          GEM_BUG_ON(!response_buf && response_buf_size);
> > >>>>>> +        might_sleep();
> > >>>>>> +        /*
> > >>>>>> +         * We use a lazy spin wait loop here as we believe that if the CT
> > >>>>>> +         * buffers are sized correctly the flow control condition should be
> > >>>>>> +         * rare.
> > >>>>>> +         */
> > >>>>>> +retry:
> > >>>>>>          spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > >>>>>> +        if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > >>>>>> +                spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > >>>>>> +                cond_resched();
> > >>>>>> +                goto retry;
> > >>>>>> +        }
> > >>>>>>
> > >>>>>> So it looks like a change to ct_send to me. Is that wrong?
> > >>>>
> > >>>> What about this part - is the patch changing the blocking ct_send or not,
> > >>>> and if it is why?
> > >>>>
> > >>>
> > >>> Yes, ct_send() changes. Sorry for the confusion.
> > >>>
> > >>> This function needs to be updated to account for the H2G space and
> > >>> backoff if no space is available.
> > >>
> > >> Since this one is the sleeping path, it probably can and needs to be smarter
> > >> than having a cond_resched busy loop added. Like sleep and get woken up when
> > >> there is space. Otherwise it can degenerate to busy looping via contention
> > >> with the non-blocking path.
> > >>
> > >
> > > That screams over enginerring a simple problem to me. If the CT channel
> > > is full we are really in trouble anyways - i.e. the performance is going
> > > to terrible as we overwhelmed the GuC with traffic. That being said,
> >
> > Performance of what would be terrible? Something relating to submitting
> > new jobs to the GPU I guess. Or something SRIOV related as you hint below.
> >
> > But there is no real reason why CPU cycles/power should suffer if GuC is
> > busy.
> >
> > Okay, if it can't happen in real world then it's possibly passable as a
> > design of a communication interface. But to me it leaves a bad taste and
> > a doubt that there is this other aspect of the real world. And that is
> > when the unexpected happens. Even the most trivial things like a bug in
> > GuC firmware causes the driver to busy spin in there. So not much
> > happening on the machine but CPU cores pinned burning cycles in this
> > code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> > and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> > machine". Oh well..
> >
> > At least I think the commit message should spell out clearly that a busy
> > looping path is being added to the sleeping send as a downside of
> > implementation choices. Still, for the record, I object to the design.
> >
> > > IGTs can do this but that really isn't a real world use case. For the
> > > real world, this buffer is large enough that it won't ever be full hence
> > > the comment + lazy spin loop.
> > >
> > > Next, it isn't like we get an interrupt or something when space
> > > becomes available so how would we wake this thread? Could we come up
> > > with a convoluted scheme where we insert ops that generated an interrupt
> > > at regular intervals, probably? Would it be super complicated, totally
> > > unnecessary, and gain use nothing - absolutely.
> > >
> > > Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> > > submission code doesn't use these. I think SRIOV might, but those can
> > > probably be reworked too to use non-blocking. At some point we might
> > > want to scrub the driver and just delete the blocking path.
> 
> I'd do an s/cond_resched()/msleep(1)/ and comment explaining why we
> just don't care about this. That checks of the cpu wasting in this
> case (GuC is overloaded, it wont come back anytime soon anyway) and
> explains why we really don't want to make this any more clever or
> complex code (because comment can explain why we wont hit this in
> actual real world usage except when something else is on fire already
> anyway).
> 

Sounds good.

> If you want to go absolutely overkill and it's not too much work, make
> the msleep interruptible or check for signals, and bail out. That way
> the process can be made unstuck with ^C at least.

This loop is already bound by a timer and if no forward progress is made
we pop out of this loop. It is assumed if this happens the GuC / GPU is
dead a and full GPU reset will have to be issued. A following patch
adds the timer, a bit later in submission section of the series a patch
is added to trigger the reset.

Matt

> -Daniel
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Matthew Brost June 9, 2021, 11:13 p.m. UTC | #17
On Wed, Jun 09, 2021 at 04:14:05PM +0200, Michal Wajdeczko wrote:
> 
> 
> On 07.06.2021 19:31, Matthew Brost wrote:
> > On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> >>
> >> On 27/05/2021 15:35, Matthew Brost wrote:
> >>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> >>>>
> >>>> On 26/05/2021 19:10, Matthew Brost wrote:
> >>>>
> >>>> [snip]
> >>>>
> >>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> >>>>>>>>> +		      const u32 *action,
> >>>>>>>>> +		      u32 len,
> >>>>>>>>> +		      u32 flags)
> >>>>>>>>> +{
> >>>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>>> +	unsigned long spin_flags;
> >>>>>>>>> +	u32 fence;
> >>>>>>>>> +	int ret;
> >>>>>>>>> +
> >>>>>>>>> +	spin_lock_irqsave(&ctb->lock, spin_flags);
> >>>>>>>>> +
> >>>>>>>>> +	ret = ctb_has_room(ctb, len + 1);
> >>>>>>>>> +	if (unlikely(ret))
> >>>>>>>>> +		goto out;
> >>>>>>>>> +
> >>>>>>>>> +	fence = ct_get_next_fence(ct);
> >>>>>>>>> +	ret = ct_write(ct, action, len, fence, flags);
> >>>>>>>>> +	if (unlikely(ret))
> >>>>>>>>> +		goto out;
> >>>>>>>>> +
> >>>>>>>>> +	intel_guc_notify(ct_to_guc(ct));
> >>>>>>>>> +
> >>>>>>>>> +out:
> >>>>>>>>> +	spin_unlock_irqrestore(&ctb->lock, spin_flags);
> >>>>>>>>> +
> >>>>>>>>> +	return ret;
> >>>>>>>>> +}
> >>>>>>>>> +
> >>>>>>>>>      static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>      		   const u32 *action,
> >>>>>>>>>      		   u32 len,
> >>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>      		   u32 response_buf_size,
> >>>>>>>>>      		   u32 *status)
> >>>>>>>>>      {
> >>>>>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>>>>      	struct ct_request request;
> >>>>>>>>>      	unsigned long flags;
> >>>>>>>>>      	u32 fence;
> >>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>>>>      	GEM_BUG_ON(!len);
> >>>>>>>>>      	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >>>>>>>>>      	GEM_BUG_ON(!response_buf && response_buf_size);
> >>>>>>>>> +	might_sleep();
> >>>>>>>>
> >>>>>>>> Sleep is just cond_resched below or there is more?
> >>>>>>>>
> >>>>>>>
> >>>>>>> Yes, the cond_resched.
> >>>>>>>
> >>>>>>>>> +	/*
> >>>>>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
> >>>>>>>>> +	 * buffers are sized correctly the flow control condition should be
> >>>>>>>>> +	 * rare.
> >>>>>>>>> +	 */
> >>>>>>>>> +retry:
> >>>>>>>>>      	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> >>>>>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> >>>>>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >>>>>>>>> +		cond_resched();
> >>>>>>>>> +		goto retry;
> >>>>>>>>> +	}
> >>>>>>>>
> >>>>>>>> If this patch is about adding a non-blocking send function, and below we can
> >>>>>>>> see that it creates a fork:
> >>>>>>>>
> >>>>>>>> intel_guc_ct_send:
> >>>>>>>> ...
> >>>>>>>> 	if (flags & INTEL_GUC_SEND_NB)
> >>>>>>>> 		return ct_send_nb(ct, action, len, flags);
> >>>>>>>>
> >>>>>>>>     	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> >>>>>>>>
> >>>>>>>> Then why is there a change in ct_send here, which is not the new
> >>>>>>>> non-blocking path?
> >>>>>>>>
> >>>>>>>
> >>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> >>>>>>
> >>>>>> I was doing by the diff which says:
> >>>>>>
> >>>>>>    static int ct_send(struct intel_guc_ct *ct,
> >>>>>>    		   const u32 *action,
> >>>>>>    		   u32 len,
> >>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>    		   u32 response_buf_size,
> >>>>>>    		   u32 *status)
> >>>>>>    {
> >>>>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> >>>>>>    	struct ct_request request;
> >>>>>>    	unsigned long flags;
> >>>>>>    	u32 fence;
> >>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> >>>>>>    	GEM_BUG_ON(!len);
> >>>>>>    	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> >>>>>>    	GEM_BUG_ON(!response_buf && response_buf_size);
> >>>>>> +	might_sleep();
> >>>>>> +	/*
> >>>>>> +	 * We use a lazy spin wait loop here as we believe that if the CT
> >>>>>> +	 * buffers are sized correctly the flow control condition should be
> >>>>>> +	 * rare.
> >>>>>> +	 */
> >>>>>> +retry:
> >>>>>>    	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> >>>>>> +	if (unlikely(!ctb_has_room(ctb, len + 1))) {
> >>>>>> +		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> >>>>>> +		cond_resched();
> >>>>>> +		goto retry;
> >>>>>> +	}
> >>>>>>
> >>>>>> So it looks like a change to ct_send to me. Is that wrong?
> >>>>
> >>>> What about this part - is the patch changing the blocking ct_send or not,
> >>>> and if it is why?
> >>>>
> >>>
> >>> Yes, ct_send() changes. Sorry for the confusion.
> >>>
> >>> This function needs to be updated to account for the H2G space and
> >>> backoff if no space is available.
> >>
> >> Since this one is the sleeping path, it probably can and needs to be smarter
> >> than having a cond_resched busy loop added. Like sleep and get woken up when
> >> there is space. Otherwise it can degenerate to busy looping via contention
> >> with the non-blocking path.
> >>
> > 
> > That screams over enginerring a simple problem to me. If the CT channel
> > is full we are really in trouble anyways - i.e. the performance is going
> > to terrible as we overwhelmed the GuC with traffic. That being said,
> > IGTs can do this but that really isn't a real world use case. For the
> > real world, this buffer is large enough that it won't ever be full hence
> > the comment + lazy spin loop.
> > 
> > Next, it isn't like we get an interrupt or something when space
> > becomes available so how would we wake this thread? Could we come up
> > with a convoluted scheme where we insert ops that generated an interrupt
> > at regular intervals, probably? Would it be super complicated, totally
> > unnecessary, and gain use nothing - absolutely.
> > 
> > Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> > submission code doesn't use these. I think SRIOV might, but those can
> > probably be reworked too to use non-blocking. At some point we might
> > want to scrub the driver and just delete the blocking path.
> 
> I guess the main problem is not with "blocking CTBs", as now only
> calling thread is "blocked" waiting for reply and other threads can
> still send their CTBs (blocked/nonblocking), but the fact that we are
> sending too many messages, stopping only when CTB is full, and even then
> trying hard to squeeze that message again.
> 
> it should be caller responsibility to throttle its stream of
> non-blocking CTBs if either we are running out of CTB but if we have too
> many "non-blocking" requests in flight.
> 
> making CTB buffer just larger and larger does not solve the problem,
> only makes it less visible
> 
> and as you are using busy-loop to send even 'non-blocking' CTBs, it
> might indicate that your code is not prepared to step-back in case of
> any temporary CTB congestion
> 
> also note that currently all CTB messages are asynchronous, REQUEST /
> RESPONSE pair could be processed in fully non-blocking approach, but
> that would require refactoring of part driver into event-driven state
> machine, as sometimes we can't move forward without information that we
> are waiting from the GuC (and blocking was simplest solution for that)
> 
> but if your submission code is already  event-driven, then it should be
> easier to trigger state machine into 'retry' mode without using this
> busy-loop

Yes, the state-machine is used in most cases as a back off where it
makes sense. Some cases we still just use a busy-loop. See my comments
about over engineering solutions - sometimes it is better to use
something simple for something that rare.

Matt

> 
> > 
> > Matt
> > 
> >> Regards,
> > 
> >>
> >> Tvrtko
Daniel Vetter June 10, 2021, 3:27 p.m. UTC | #18
On Wed, Jun 09, 2021 at 04:10:23PM -0700, Matthew Brost wrote:
> On Tue, Jun 08, 2021 at 10:46:15AM +0200, Daniel Vetter wrote:
> > On Tue, Jun 8, 2021 at 10:39 AM Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com> wrote:
> > >
> > >
> > > On 07/06/2021 18:31, Matthew Brost wrote:
> > > > On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> > > >>
> > > >> On 27/05/2021 15:35, Matthew Brost wrote:
> > > >>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> > > >>>>
> > > >>>> On 26/05/2021 19:10, Matthew Brost wrote:
> > > >>>>
> > > >>>> [snip]
> > > >>>>
> > > >>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> > > >>>>>>>>> +                   const u32 *action,
> > > >>>>>>>>> +                   u32 len,
> > > >>>>>>>>> +                   u32 flags)
> > > >>>>>>>>> +{
> > > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > >>>>>>>>> +     unsigned long spin_flags;
> > > >>>>>>>>> +     u32 fence;
> > > >>>>>>>>> +     int ret;
> > > >>>>>>>>> +
> > > >>>>>>>>> +     spin_lock_irqsave(&ctb->lock, spin_flags);
> > > >>>>>>>>> +
> > > >>>>>>>>> +     ret = ctb_has_room(ctb, len + 1);
> > > >>>>>>>>> +     if (unlikely(ret))
> > > >>>>>>>>> +             goto out;
> > > >>>>>>>>> +
> > > >>>>>>>>> +     fence = ct_get_next_fence(ct);
> > > >>>>>>>>> +     ret = ct_write(ct, action, len, fence, flags);
> > > >>>>>>>>> +     if (unlikely(ret))
> > > >>>>>>>>> +             goto out;
> > > >>>>>>>>> +
> > > >>>>>>>>> +     intel_guc_notify(ct_to_guc(ct));
> > > >>>>>>>>> +
> > > >>>>>>>>> +out:
> > > >>>>>>>>> +     spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > > >>>>>>>>> +
> > > >>>>>>>>> +     return ret;
> > > >>>>>>>>> +}
> > > >>>>>>>>> +
> > > >>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
> > > >>>>>>>>>                          const u32 *action,
> > > >>>>>>>>>                          u32 len,
> > > >>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >>>>>>>>>                          u32 response_buf_size,
> > > >>>>>>>>>                          u32 *status)
> > > >>>>>>>>>       {
> > > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > >>>>>>>>>               struct ct_request request;
> > > >>>>>>>>>               unsigned long flags;
> > > >>>>>>>>>               u32 fence;
> > > >>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >>>>>>>>>               GEM_BUG_ON(!len);
> > > >>>>>>>>>               GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > >>>>>>>>>               GEM_BUG_ON(!response_buf && response_buf_size);
> > > >>>>>>>>> +     might_sleep();
> > > >>>>>>>>
> > > >>>>>>>> Sleep is just cond_resched below or there is more?
> > > >>>>>>>>
> > > >>>>>>>
> > > >>>>>>> Yes, the cond_resched.
> > > >>>>>>>
> > > >>>>>>>>> +     /*
> > > >>>>>>>>> +      * We use a lazy spin wait loop here as we believe that if the CT
> > > >>>>>>>>> +      * buffers are sized correctly the flow control condition should be
> > > >>>>>>>>> +      * rare.
> > > >>>>>>>>> +      */
> > > >>>>>>>>> +retry:
> > > >>>>>>>>>               spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > >>>>>>>>> +     if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > >>>>>>>>> +             spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > >>>>>>>>> +             cond_resched();
> > > >>>>>>>>> +             goto retry;
> > > >>>>>>>>> +     }
> > > >>>>>>>>
> > > >>>>>>>> If this patch is about adding a non-blocking send function, and below we can
> > > >>>>>>>> see that it creates a fork:
> > > >>>>>>>>
> > > >>>>>>>> intel_guc_ct_send:
> > > >>>>>>>> ...
> > > >>>>>>>>        if (flags & INTEL_GUC_SEND_NB)
> > > >>>>>>>>                return ct_send_nb(ct, action, len, flags);
> > > >>>>>>>>
> > > >>>>>>>>        ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > >>>>>>>>
> > > >>>>>>>> Then why is there a change in ct_send here, which is not the new
> > > >>>>>>>> non-blocking path?
> > > >>>>>>>>
> > > >>>>>>>
> > > >>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> > > >>>>>>
> > > >>>>>> I was doing by the diff which says:
> > > >>>>>>
> > > >>>>>>     static int ct_send(struct intel_guc_ct *ct,
> > > >>>>>>                     const u32 *action,
> > > >>>>>>                     u32 len,
> > > >>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >>>>>>                     u32 response_buf_size,
> > > >>>>>>                     u32 *status)
> > > >>>>>>     {
> > > >>>>>> +        struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > >>>>>>          struct ct_request request;
> > > >>>>>>          unsigned long flags;
> > > >>>>>>          u32 fence;
> > > >>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > >>>>>>          GEM_BUG_ON(!len);
> > > >>>>>>          GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > >>>>>>          GEM_BUG_ON(!response_buf && response_buf_size);
> > > >>>>>> +        might_sleep();
> > > >>>>>> +        /*
> > > >>>>>> +         * We use a lazy spin wait loop here as we believe that if the CT
> > > >>>>>> +         * buffers are sized correctly the flow control condition should be
> > > >>>>>> +         * rare.
> > > >>>>>> +         */
> > > >>>>>> +retry:
> > > >>>>>>          spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > >>>>>> +        if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > >>>>>> +                spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > >>>>>> +                cond_resched();
> > > >>>>>> +                goto retry;
> > > >>>>>> +        }
> > > >>>>>>
> > > >>>>>> So it looks like a change to ct_send to me. Is that wrong?
> > > >>>>
> > > >>>> What about this part - is the patch changing the blocking ct_send or not,
> > > >>>> and if it is why?
> > > >>>>
> > > >>>
> > > >>> Yes, ct_send() changes. Sorry for the confusion.
> > > >>>
> > > >>> This function needs to be updated to account for the H2G space and
> > > >>> backoff if no space is available.
> > > >>
> > > >> Since this one is the sleeping path, it probably can and needs to be smarter
> > > >> than having a cond_resched busy loop added. Like sleep and get woken up when
> > > >> there is space. Otherwise it can degenerate to busy looping via contention
> > > >> with the non-blocking path.
> > > >>
> > > >
> > > > That screams over enginerring a simple problem to me. If the CT channel
> > > > is full we are really in trouble anyways - i.e. the performance is going
> > > > to terrible as we overwhelmed the GuC with traffic. That being said,
> > >
> > > Performance of what would be terrible? Something relating to submitting
> > > new jobs to the GPU I guess. Or something SRIOV related as you hint below.
> > >
> > > But there is no real reason why CPU cycles/power should suffer if GuC is
> > > busy.
> > >
> > > Okay, if it can't happen in real world then it's possibly passable as a
> > > design of a communication interface. But to me it leaves a bad taste and
> > > a doubt that there is this other aspect of the real world. And that is
> > > when the unexpected happens. Even the most trivial things like a bug in
> > > GuC firmware causes the driver to busy spin in there. So not much
> > > happening on the machine but CPU cores pinned burning cycles in this
> > > code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> > > and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> > > machine". Oh well..
> > >
> > > At least I think the commit message should spell out clearly that a busy
> > > looping path is being added to the sleeping send as a downside of
> > > implementation choices. Still, for the record, I object to the design.
> > >
> > > > IGTs can do this but that really isn't a real world use case. For the
> > > > real world, this buffer is large enough that it won't ever be full hence
> > > > the comment + lazy spin loop.
> > > >
> > > > Next, it isn't like we get an interrupt or something when space
> > > > becomes available so how would we wake this thread? Could we come up
> > > > with a convoluted scheme where we insert ops that generated an interrupt
> > > > at regular intervals, probably? Would it be super complicated, totally
> > > > unnecessary, and gain use nothing - absolutely.
> > > >
> > > > Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> > > > submission code doesn't use these. I think SRIOV might, but those can
> > > > probably be reworked too to use non-blocking. At some point we might
> > > > want to scrub the driver and just delete the blocking path.
> > 
> > I'd do an s/cond_resched()/msleep(1)/ and comment explaining why we
> > just don't care about this. That checks of the cpu wasting in this
> > case (GuC is overloaded, it wont come back anytime soon anyway) and
> > explains why we really don't want to make this any more clever or
> > complex code (because comment can explain why we wont hit this in
> > actual real world usage except when something else is on fire already
> > anyway).
> > 
> 
> Sounds good.
> 
> > If you want to go absolutely overkill and it's not too much work, make
> > the msleep interruptible or check for signals, and bail out. That way
> > the process can be made unstuck with ^C at least.
> 
> This loop is already bound by a timer and if no forward progress is made
> we pop out of this loop. It is assumed if this happens the GuC / GPU is
> dead a and full GPU reset will have to be issued. A following patch
> adds the timer, a bit later in submission section of the series a patch
> is added to trigger the reset.

Yeah timeout bail-out works too, and if you then switch it from timeout to
also interruptible it shouldn't be much more code. It's just nice to not
have any uninterruptible sleep.
-Daniel
Matthew Brost June 24, 2021, 4:38 p.m. UTC | #19
On Thu, Jun 10, 2021 at 05:27:48PM +0200, Daniel Vetter wrote:
> On Wed, Jun 09, 2021 at 04:10:23PM -0700, Matthew Brost wrote:
> > On Tue, Jun 08, 2021 at 10:46:15AM +0200, Daniel Vetter wrote:
> > > On Tue, Jun 8, 2021 at 10:39 AM Tvrtko Ursulin
> > > <tvrtko.ursulin@linux.intel.com> wrote:
> > > >
> > > >
> > > > On 07/06/2021 18:31, Matthew Brost wrote:
> > > > > On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> > > > >>
> > > > >> On 27/05/2021 15:35, Matthew Brost wrote:
> > > > >>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> > > > >>>>
> > > > >>>> On 26/05/2021 19:10, Matthew Brost wrote:
> > > > >>>>
> > > > >>>> [snip]
> > > > >>>>
> > > > >>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> > > > >>>>>>>>> +                   const u32 *action,
> > > > >>>>>>>>> +                   u32 len,
> > > > >>>>>>>>> +                   u32 flags)
> > > > >>>>>>>>> +{
> > > > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > >>>>>>>>> +     unsigned long spin_flags;
> > > > >>>>>>>>> +     u32 fence;
> > > > >>>>>>>>> +     int ret;
> > > > >>>>>>>>> +
> > > > >>>>>>>>> +     spin_lock_irqsave(&ctb->lock, spin_flags);
> > > > >>>>>>>>> +
> > > > >>>>>>>>> +     ret = ctb_has_room(ctb, len + 1);
> > > > >>>>>>>>> +     if (unlikely(ret))
> > > > >>>>>>>>> +             goto out;
> > > > >>>>>>>>> +
> > > > >>>>>>>>> +     fence = ct_get_next_fence(ct);
> > > > >>>>>>>>> +     ret = ct_write(ct, action, len, fence, flags);
> > > > >>>>>>>>> +     if (unlikely(ret))
> > > > >>>>>>>>> +             goto out;
> > > > >>>>>>>>> +
> > > > >>>>>>>>> +     intel_guc_notify(ct_to_guc(ct));
> > > > >>>>>>>>> +
> > > > >>>>>>>>> +out:
> > > > >>>>>>>>> +     spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > > > >>>>>>>>> +
> > > > >>>>>>>>> +     return ret;
> > > > >>>>>>>>> +}
> > > > >>>>>>>>> +
> > > > >>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
> > > > >>>>>>>>>                          const u32 *action,
> > > > >>>>>>>>>                          u32 len,
> > > > >>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > >>>>>>>>>                          u32 response_buf_size,
> > > > >>>>>>>>>                          u32 *status)
> > > > >>>>>>>>>       {
> > > > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > >>>>>>>>>               struct ct_request request;
> > > > >>>>>>>>>               unsigned long flags;
> > > > >>>>>>>>>               u32 fence;
> > > > >>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > >>>>>>>>>               GEM_BUG_ON(!len);
> > > > >>>>>>>>>               GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > >>>>>>>>>               GEM_BUG_ON(!response_buf && response_buf_size);
> > > > >>>>>>>>> +     might_sleep();
> > > > >>>>>>>>
> > > > >>>>>>>> Sleep is just cond_resched below or there is more?
> > > > >>>>>>>>
> > > > >>>>>>>
> > > > >>>>>>> Yes, the cond_resched.
> > > > >>>>>>>
> > > > >>>>>>>>> +     /*
> > > > >>>>>>>>> +      * We use a lazy spin wait loop here as we believe that if the CT
> > > > >>>>>>>>> +      * buffers are sized correctly the flow control condition should be
> > > > >>>>>>>>> +      * rare.
> > > > >>>>>>>>> +      */
> > > > >>>>>>>>> +retry:
> > > > >>>>>>>>>               spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > >>>>>>>>> +     if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > >>>>>>>>> +             spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > >>>>>>>>> +             cond_resched();
> > > > >>>>>>>>> +             goto retry;
> > > > >>>>>>>>> +     }
> > > > >>>>>>>>
> > > > >>>>>>>> If this patch is about adding a non-blocking send function, and below we can
> > > > >>>>>>>> see that it creates a fork:
> > > > >>>>>>>>
> > > > >>>>>>>> intel_guc_ct_send:
> > > > >>>>>>>> ...
> > > > >>>>>>>>        if (flags & INTEL_GUC_SEND_NB)
> > > > >>>>>>>>                return ct_send_nb(ct, action, len, flags);
> > > > >>>>>>>>
> > > > >>>>>>>>        ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > > >>>>>>>>
> > > > >>>>>>>> Then why is there a change in ct_send here, which is not the new
> > > > >>>>>>>> non-blocking path?
> > > > >>>>>>>>
> > > > >>>>>>>
> > > > >>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> > > > >>>>>>
> > > > >>>>>> I was doing by the diff which says:
> > > > >>>>>>
> > > > >>>>>>     static int ct_send(struct intel_guc_ct *ct,
> > > > >>>>>>                     const u32 *action,
> > > > >>>>>>                     u32 len,
> > > > >>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > >>>>>>                     u32 response_buf_size,
> > > > >>>>>>                     u32 *status)
> > > > >>>>>>     {
> > > > >>>>>> +        struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > >>>>>>          struct ct_request request;
> > > > >>>>>>          unsigned long flags;
> > > > >>>>>>          u32 fence;
> > > > >>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > >>>>>>          GEM_BUG_ON(!len);
> > > > >>>>>>          GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > >>>>>>          GEM_BUG_ON(!response_buf && response_buf_size);
> > > > >>>>>> +        might_sleep();
> > > > >>>>>> +        /*
> > > > >>>>>> +         * We use a lazy spin wait loop here as we believe that if the CT
> > > > >>>>>> +         * buffers are sized correctly the flow control condition should be
> > > > >>>>>> +         * rare.
> > > > >>>>>> +         */
> > > > >>>>>> +retry:
> > > > >>>>>>          spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > >>>>>> +        if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > >>>>>> +                spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > >>>>>> +                cond_resched();
> > > > >>>>>> +                goto retry;
> > > > >>>>>> +        }
> > > > >>>>>>
> > > > >>>>>> So it looks like a change to ct_send to me. Is that wrong?
> > > > >>>>
> > > > >>>> What about this part - is the patch changing the blocking ct_send or not,
> > > > >>>> and if it is why?
> > > > >>>>
> > > > >>>
> > > > >>> Yes, ct_send() changes. Sorry for the confusion.
> > > > >>>
> > > > >>> This function needs to be updated to account for the H2G space and
> > > > >>> backoff if no space is available.
> > > > >>
> > > > >> Since this one is the sleeping path, it probably can and needs to be smarter
> > > > >> than having a cond_resched busy loop added. Like sleep and get woken up when
> > > > >> there is space. Otherwise it can degenerate to busy looping via contention
> > > > >> with the non-blocking path.
> > > > >>
> > > > >
> > > > > That screams over enginerring a simple problem to me. If the CT channel
> > > > > is full we are really in trouble anyways - i.e. the performance is going
> > > > > to terrible as we overwhelmed the GuC with traffic. That being said,
> > > >
> > > > Performance of what would be terrible? Something relating to submitting
> > > > new jobs to the GPU I guess. Or something SRIOV related as you hint below.
> > > >
> > > > But there is no real reason why CPU cycles/power should suffer if GuC is
> > > > busy.
> > > >
> > > > Okay, if it can't happen in real world then it's possibly passable as a
> > > > design of a communication interface. But to me it leaves a bad taste and
> > > > a doubt that there is this other aspect of the real world. And that is
> > > > when the unexpected happens. Even the most trivial things like a bug in
> > > > GuC firmware causes the driver to busy spin in there. So not much
> > > > happening on the machine but CPU cores pinned burning cycles in this
> > > > code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> > > > and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> > > > machine". Oh well..
> > > >
> > > > At least I think the commit message should spell out clearly that a busy
> > > > looping path is being added to the sleeping send as a downside of
> > > > implementation choices. Still, for the record, I object to the design.
> > > >
> > > > > IGTs can do this but that really isn't a real world use case. For the
> > > > > real world, this buffer is large enough that it won't ever be full hence
> > > > > the comment + lazy spin loop.
> > > > >
> > > > > Next, it isn't like we get an interrupt or something when space
> > > > > becomes available so how would we wake this thread? Could we come up
> > > > > with a convoluted scheme where we insert ops that generated an interrupt
> > > > > at regular intervals, probably? Would it be super complicated, totally
> > > > > unnecessary, and gain use nothing - absolutely.
> > > > >
> > > > > Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> > > > > submission code doesn't use these. I think SRIOV might, but those can
> > > > > probably be reworked too to use non-blocking. At some point we might
> > > > > want to scrub the driver and just delete the blocking path.
> > > 
> > > I'd do an s/cond_resched()/msleep(1)/ and comment explaining why we
> > > just don't care about this. That checks of the cpu wasting in this
> > > case (GuC is overloaded, it wont come back anytime soon anyway) and
> > > explains why we really don't want to make this any more clever or
> > > complex code (because comment can explain why we wont hit this in
> > > actual real world usage except when something else is on fire already
> > > anyway).
> > > 
> > 
> > Sounds good.
> > 
> > > If you want to go absolutely overkill and it's not too much work, make
> > > the msleep interruptible or check for signals, and bail out. That way
> > > the process can be made unstuck with ^C at least.
> > 
> > This loop is already bound by a timer and if no forward progress is made
> > we pop out of this loop. It is assumed if this happens the GuC / GPU is
> > dead a and full GPU reset will have to be issued. A following patch
> > adds the timer, a bit later in submission section of the series a patch
> > is added to trigger the reset.
> 
> Yeah timeout bail-out works too, and if you then switch it from timeout to
> also interruptible it shouldn't be much more code. It's just nice to not
> have any uninterruptible sleep.
> -Daniel

I didn't get this in my next rev as I didn't know how to do this off
hand but I think all I need to add is something like this to each
iteration of the busy loops, right?

if (signal_pending_state(TASK_INTERRUPTIBLE, current))
	bail out of busy loop, and return and error

Matt

> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter June 24, 2021, 5:25 p.m. UTC | #20
On Thu, Jun 24, 2021 at 09:38:33AM -0700, Matthew Brost wrote:
> On Thu, Jun 10, 2021 at 05:27:48PM +0200, Daniel Vetter wrote:
> > On Wed, Jun 09, 2021 at 04:10:23PM -0700, Matthew Brost wrote:
> > > On Tue, Jun 08, 2021 at 10:46:15AM +0200, Daniel Vetter wrote:
> > > > On Tue, Jun 8, 2021 at 10:39 AM Tvrtko Ursulin
> > > > <tvrtko.ursulin@linux.intel.com> wrote:
> > > > >
> > > > >
> > > > > On 07/06/2021 18:31, Matthew Brost wrote:
> > > > > > On Thu, May 27, 2021 at 04:11:50PM +0100, Tvrtko Ursulin wrote:
> > > > > >>
> > > > > >> On 27/05/2021 15:35, Matthew Brost wrote:
> > > > > >>> On Thu, May 27, 2021 at 11:02:24AM +0100, Tvrtko Ursulin wrote:
> > > > > >>>>
> > > > > >>>> On 26/05/2021 19:10, Matthew Brost wrote:
> > > > > >>>>
> > > > > >>>> [snip]
> > > > > >>>>
> > > > > >>>>>>>>> +static int ct_send_nb(struct intel_guc_ct *ct,
> > > > > >>>>>>>>> +                   const u32 *action,
> > > > > >>>>>>>>> +                   u32 len,
> > > > > >>>>>>>>> +                   u32 flags)
> > > > > >>>>>>>>> +{
> > > > > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > >>>>>>>>> +     unsigned long spin_flags;
> > > > > >>>>>>>>> +     u32 fence;
> > > > > >>>>>>>>> +     int ret;
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>> +     spin_lock_irqsave(&ctb->lock, spin_flags);
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>> +     ret = ctb_has_room(ctb, len + 1);
> > > > > >>>>>>>>> +     if (unlikely(ret))
> > > > > >>>>>>>>> +             goto out;
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>> +     fence = ct_get_next_fence(ct);
> > > > > >>>>>>>>> +     ret = ct_write(ct, action, len, fence, flags);
> > > > > >>>>>>>>> +     if (unlikely(ret))
> > > > > >>>>>>>>> +             goto out;
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>> +     intel_guc_notify(ct_to_guc(ct));
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>> +out:
> > > > > >>>>>>>>> +     spin_unlock_irqrestore(&ctb->lock, spin_flags);
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>> +     return ret;
> > > > > >>>>>>>>> +}
> > > > > >>>>>>>>> +
> > > > > >>>>>>>>>       static int ct_send(struct intel_guc_ct *ct,
> > > > > >>>>>>>>>                          const u32 *action,
> > > > > >>>>>>>>>                          u32 len,
> > > > > >>>>>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >>>>>>>>>                          u32 response_buf_size,
> > > > > >>>>>>>>>                          u32 *status)
> > > > > >>>>>>>>>       {
> > > > > >>>>>>>>> +     struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > >>>>>>>>>               struct ct_request request;
> > > > > >>>>>>>>>               unsigned long flags;
> > > > > >>>>>>>>>               u32 fence;
> > > > > >>>>>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >>>>>>>>>               GEM_BUG_ON(!len);
> > > > > >>>>>>>>>               GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > > >>>>>>>>>               GEM_BUG_ON(!response_buf && response_buf_size);
> > > > > >>>>>>>>> +     might_sleep();
> > > > > >>>>>>>>
> > > > > >>>>>>>> Sleep is just cond_resched below or there is more?
> > > > > >>>>>>>>
> > > > > >>>>>>>
> > > > > >>>>>>> Yes, the cond_resched.
> > > > > >>>>>>>
> > > > > >>>>>>>>> +     /*
> > > > > >>>>>>>>> +      * We use a lazy spin wait loop here as we believe that if the CT
> > > > > >>>>>>>>> +      * buffers are sized correctly the flow control condition should be
> > > > > >>>>>>>>> +      * rare.
> > > > > >>>>>>>>> +      */
> > > > > >>>>>>>>> +retry:
> > > > > >>>>>>>>>               spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > > >>>>>>>>> +     if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > > >>>>>>>>> +             spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > > >>>>>>>>> +             cond_resched();
> > > > > >>>>>>>>> +             goto retry;
> > > > > >>>>>>>>> +     }
> > > > > >>>>>>>>
> > > > > >>>>>>>> If this patch is about adding a non-blocking send function, and below we can
> > > > > >>>>>>>> see that it creates a fork:
> > > > > >>>>>>>>
> > > > > >>>>>>>> intel_guc_ct_send:
> > > > > >>>>>>>> ...
> > > > > >>>>>>>>        if (flags & INTEL_GUC_SEND_NB)
> > > > > >>>>>>>>                return ct_send_nb(ct, action, len, flags);
> > > > > >>>>>>>>
> > > > > >>>>>>>>        ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
> > > > > >>>>>>>>
> > > > > >>>>>>>> Then why is there a change in ct_send here, which is not the new
> > > > > >>>>>>>> non-blocking path?
> > > > > >>>>>>>>
> > > > > >>>>>>>
> > > > > >>>>>>> There is not a change to ct_send(), just to intel_guc_ct_send.
> > > > > >>>>>>
> > > > > >>>>>> I was doing by the diff which says:
> > > > > >>>>>>
> > > > > >>>>>>     static int ct_send(struct intel_guc_ct *ct,
> > > > > >>>>>>                     const u32 *action,
> > > > > >>>>>>                     u32 len,
> > > > > >>>>>> @@ -473,6 +541,7 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >>>>>>                     u32 response_buf_size,
> > > > > >>>>>>                     u32 *status)
> > > > > >>>>>>     {
> > > > > >>>>>> +        struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> > > > > >>>>>>          struct ct_request request;
> > > > > >>>>>>          unsigned long flags;
> > > > > >>>>>>          u32 fence;
> > > > > >>>>>> @@ -482,8 +551,20 @@ static int ct_send(struct intel_guc_ct *ct,
> > > > > >>>>>>          GEM_BUG_ON(!len);
> > > > > >>>>>>          GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> > > > > >>>>>>          GEM_BUG_ON(!response_buf && response_buf_size);
> > > > > >>>>>> +        might_sleep();
> > > > > >>>>>> +        /*
> > > > > >>>>>> +         * We use a lazy spin wait loop here as we believe that if the CT
> > > > > >>>>>> +         * buffers are sized correctly the flow control condition should be
> > > > > >>>>>> +         * rare.
> > > > > >>>>>> +         */
> > > > > >>>>>> +retry:
> > > > > >>>>>>          spin_lock_irqsave(&ct->ctbs.send.lock, flags);
> > > > > >>>>>> +        if (unlikely(!ctb_has_room(ctb, len + 1))) {
> > > > > >>>>>> +                spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
> > > > > >>>>>> +                cond_resched();
> > > > > >>>>>> +                goto retry;
> > > > > >>>>>> +        }
> > > > > >>>>>>
> > > > > >>>>>> So it looks like a change to ct_send to me. Is that wrong?
> > > > > >>>>
> > > > > >>>> What about this part - is the patch changing the blocking ct_send or not,
> > > > > >>>> and if it is why?
> > > > > >>>>
> > > > > >>>
> > > > > >>> Yes, ct_send() changes. Sorry for the confusion.
> > > > > >>>
> > > > > >>> This function needs to be updated to account for the H2G space and
> > > > > >>> backoff if no space is available.
> > > > > >>
> > > > > >> Since this one is the sleeping path, it probably can and needs to be smarter
> > > > > >> than having a cond_resched busy loop added. Like sleep and get woken up when
> > > > > >> there is space. Otherwise it can degenerate to busy looping via contention
> > > > > >> with the non-blocking path.
> > > > > >>
> > > > > >
> > > > > > That screams over enginerring a simple problem to me. If the CT channel
> > > > > > is full we are really in trouble anyways - i.e. the performance is going
> > > > > > to terrible as we overwhelmed the GuC with traffic. That being said,
> > > > >
> > > > > Performance of what would be terrible? Something relating to submitting
> > > > > new jobs to the GPU I guess. Or something SRIOV related as you hint below.
> > > > >
> > > > > But there is no real reason why CPU cycles/power should suffer if GuC is
> > > > > busy.
> > > > >
> > > > > Okay, if it can't happen in real world then it's possibly passable as a
> > > > > design of a communication interface. But to me it leaves a bad taste and
> > > > > a doubt that there is this other aspect of the real world. And that is
> > > > > when the unexpected happens. Even the most trivial things like a bug in
> > > > > GuC firmware causes the driver to busy spin in there. So not much
> > > > > happening on the machine but CPU cores pinned burning cycles in this
> > > > > code. It's just lazy and not robust design. "Bug #nnnnn - High CPU usage
> > > > > and GUI blocked - Solution: Upgrade GuC firmware and _reboot_ the
> > > > > machine". Oh well..
> > > > >
> > > > > At least I think the commit message should spell out clearly that a busy
> > > > > looping path is being added to the sleeping send as a downside of
> > > > > implementation choices. Still, for the record, I object to the design.
> > > > >
> > > > > > IGTs can do this but that really isn't a real world use case. For the
> > > > > > real world, this buffer is large enough that it won't ever be full hence
> > > > > > the comment + lazy spin loop.
> > > > > >
> > > > > > Next, it isn't like we get an interrupt or something when space
> > > > > > becomes available so how would we wake this thread? Could we come up
> > > > > > with a convoluted scheme where we insert ops that generated an interrupt
> > > > > > at regular intervals, probably? Would it be super complicated, totally
> > > > > > unnecessary, and gain use nothing - absolutely.
> > > > > >
> > > > > > Lastly, blocking CTBs really shouldn't ever be used. Certainly the
> > > > > > submission code doesn't use these. I think SRIOV might, but those can
> > > > > > probably be reworked too to use non-blocking. At some point we might
> > > > > > want to scrub the driver and just delete the blocking path.
> > > > 
> > > > I'd do an s/cond_resched()/msleep(1)/ and comment explaining why we
> > > > just don't care about this. That checks of the cpu wasting in this
> > > > case (GuC is overloaded, it wont come back anytime soon anyway) and
> > > > explains why we really don't want to make this any more clever or
> > > > complex code (because comment can explain why we wont hit this in
> > > > actual real world usage except when something else is on fire already
> > > > anyway).
> > > > 
> > > 
> > > Sounds good.
> > > 
> > > > If you want to go absolutely overkill and it's not too much work, make
> > > > the msleep interruptible or check for signals, and bail out. That way
> > > > the process can be made unstuck with ^C at least.
> > > 
> > > This loop is already bound by a timer and if no forward progress is made
> > > we pop out of this loop. It is assumed if this happens the GuC / GPU is
> > > dead a and full GPU reset will have to be issued. A following patch
> > > adds the timer, a bit later in submission section of the series a patch
> > > is added to trigger the reset.
> > 
> > Yeah timeout bail-out works too, and if you then switch it from timeout to
> > also interruptible it shouldn't be much more code. It's just nice to not
> > have any uninterruptible sleep.
> > -Daniel
> 
> I didn't get this in my next rev as I didn't know how to do this off
> hand but I think all I need to add is something like this to each
> iteration of the busy loops, right?
> 
> if (signal_pending_state(TASK_INTERRUPTIBLE, current))
> 	bail out of busy loop, and return and error

Yeah. Or since this is all hopeless already anyway, go with
msleep_interruptible().
-Daniel
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index c20f3839de12..4c0a367e41d8 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -75,7 +75,15 @@  static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
 static
 inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
 {
-	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0);
+	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
+}
+
+#define INTEL_GUC_SEND_NB		BIT(31)
+static
+inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len)
+{
+	return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
+				 INTEL_GUC_SEND_NB);
 }
 
 static inline int
@@ -83,7 +91,7 @@  intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
 			   u32 *response_buf, u32 response_buf_size)
 {
 	return intel_guc_ct_send(&guc->ct, action, len,
-				 response_buf, response_buf_size);
+				 response_buf, response_buf_size, 0);
 }
 
 static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index a76603537fa8..af7314d45a78 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -3,6 +3,11 @@ 
  * Copyright © 2016-2019 Intel Corporation
  */
 
+#include <linux/circ_buf.h>
+#include <linux/ktime.h>
+#include <linux/time64.h>
+#include <linux/timekeeping.h>
+
 #include "i915_drv.h"
 #include "intel_guc_ct.h"
 #include "gt/intel_gt.h"
@@ -308,6 +313,7 @@  int intel_guc_ct_enable(struct intel_guc_ct *ct)
 	if (unlikely(err))
 		goto err_deregister;
 
+	ct->requests.last_fence = 1;
 	ct->enabled = true;
 
 	return 0;
@@ -343,10 +349,22 @@  static u32 ct_get_next_fence(struct intel_guc_ct *ct)
 	return ++ct->requests.last_fence;
 }
 
+static void write_barrier(struct intel_guc_ct *ct) {
+	struct intel_guc *guc = ct_to_guc(ct);
+	struct intel_gt *gt = guc_to_gt(guc);
+
+	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
+		GEM_BUG_ON(guc->send_regs.fw_domains);
+		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
+	} else {
+		wmb();
+	}
+}
+
 static int ct_write(struct intel_guc_ct *ct,
 		    const u32 *action,
 		    u32 len /* in dwords */,
-		    u32 fence)
+		    u32 fence, u32 flags)
 {
 	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
 	struct guc_ct_buffer_desc *desc = ctb->desc;
@@ -393,9 +411,13 @@  static int ct_write(struct intel_guc_ct *ct,
 		 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
 		 FIELD_PREP(GUC_CTB_MSG_0_FENCE, fence);
 
-	hxg = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
-	      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
-			 GUC_HXG_REQUEST_MSG_0_DATA0, action[0]);
+	hxg = (flags & INTEL_GUC_SEND_NB) ?
+		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
+		 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
+			    GUC_HXG_EVENT_MSG_0_DATA0, action[0])) :
+		(FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
+		 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION |
+			    GUC_HXG_REQUEST_MSG_0_DATA0, action[0]));
 
 	CT_DEBUG(ct, "writing (tail %u) %*ph %*ph %*ph\n",
 		 tail, 4, &header, 4, &hxg, 4 * (len - 1), &action[1]);
@@ -412,6 +434,12 @@  static int ct_write(struct intel_guc_ct *ct,
 	}
 	GEM_BUG_ON(tail > size);
 
+	/*
+	 * make sure H2G buffer update and LRC tail update (if this triggering a
+	 * submission) are visable before updating the descriptor tail
+	 */
+	write_barrier(ct);
+
 	/* now update descriptor */
 	WRITE_ONCE(desc->tail, tail);
 
@@ -466,6 +494,46 @@  static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
 	return err;
 }
 
+static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
+{
+	struct guc_ct_buffer_desc *desc = ctb->desc;
+	u32 head = READ_ONCE(desc->head);
+	u32 space;
+
+	space = CIRC_SPACE(desc->tail, head, ctb->size);
+
+	return space >= len_dw;
+}
+
+static int ct_send_nb(struct intel_guc_ct *ct,
+		      const u32 *action,
+		      u32 len,
+		      u32 flags)
+{
+	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
+	unsigned long spin_flags;
+	u32 fence;
+	int ret;
+
+	spin_lock_irqsave(&ctb->lock, spin_flags);
+
+	ret = ctb_has_room(ctb, len + 1);
+	if (unlikely(ret))
+		goto out;
+
+	fence = ct_get_next_fence(ct);
+	ret = ct_write(ct, action, len, fence, flags);
+	if (unlikely(ret))
+		goto out;
+
+	intel_guc_notify(ct_to_guc(ct));
+
+out:
+	spin_unlock_irqrestore(&ctb->lock, spin_flags);
+
+	return ret;
+}
+
 static int ct_send(struct intel_guc_ct *ct,
 		   const u32 *action,
 		   u32 len,
@@ -473,6 +541,7 @@  static int ct_send(struct intel_guc_ct *ct,
 		   u32 response_buf_size,
 		   u32 *status)
 {
+	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
 	struct ct_request request;
 	unsigned long flags;
 	u32 fence;
@@ -482,8 +551,20 @@  static int ct_send(struct intel_guc_ct *ct,
 	GEM_BUG_ON(!len);
 	GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
 	GEM_BUG_ON(!response_buf && response_buf_size);
+	might_sleep();
 
+	/*
+	 * We use a lazy spin wait loop here as we believe that if the CT
+	 * buffers are sized correctly the flow control condition should be
+	 * rare.
+	 */
+retry:
 	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
+	if (unlikely(!ctb_has_room(ctb, len + 1))) {
+		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
+		cond_resched();
+		goto retry;
+	}
 
 	fence = ct_get_next_fence(ct);
 	request.fence = fence;
@@ -495,7 +576,7 @@  static int ct_send(struct intel_guc_ct *ct,
 	list_add_tail(&request.link, &ct->requests.pending);
 	spin_unlock(&ct->requests.lock);
 
-	err = ct_write(ct, action, len, fence);
+	err = ct_write(ct, action, len, fence, 0);
 
 	spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
 
@@ -537,7 +618,7 @@  static int ct_send(struct intel_guc_ct *ct,
  * Command Transport (CT) buffer based GuC send function.
  */
 int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
-		      u32 *response_buf, u32 response_buf_size)
+		      u32 *response_buf, u32 response_buf_size, u32 flags)
 {
 	u32 status = ~0; /* undefined */
 	int ret;
@@ -547,6 +628,9 @@  int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
 		return -ENODEV;
 	}
 
+	if (flags & INTEL_GUC_SEND_NB)
+		return ct_send_nb(ct, action, len, flags);
+
 	ret = ct_send(ct, action, len, response_buf, response_buf_size, &status);
 	if (unlikely(ret < 0)) {
 		CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n",
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
index 1ae2dde6db93..55ef7c52472f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
@@ -9,6 +9,7 @@ 
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
+#include <linux/ktime.h>
 
 #include "intel_guc_fwif.h"
 
@@ -42,7 +43,6 @@  struct intel_guc_ct_buffer {
 	bool broken;
 };
 
-
 /** Top-level structure for Command Transport related data
  *
  * Includes a pair of CT buffers for bi-directional communication and tracking
@@ -69,6 +69,9 @@  struct intel_guc_ct {
 		struct list_head incoming; /* incoming requests */
 		struct work_struct worker; /* handler for incoming requests */
 	} requests;
+
+	/** @stall_time: time of first time a CTB submission is stalled */
+	ktime_t stall_time;
 };
 
 void intel_guc_ct_init_early(struct intel_guc_ct *ct);
@@ -88,7 +91,7 @@  static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct)
 }
 
 int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
-		      u32 *response_buf, u32 response_buf_size);
+		      u32 *response_buf, u32 response_buf_size, u32 flags);
 void intel_guc_ct_event_handler(struct intel_guc_ct *ct);
 
 #endif /* _INTEL_GUC_CT_H_ */