diff mbox series

[05/12] accel/habanalabs: print max timeout value on CS stuck

Message ID 20230516093030.1220526-5-ogabbay@kernel.org (mailing list archive)
State New, archived
Headers show
Series [01/12] accel/habanalabs: rename security functions related arguments | expand

Commit Message

Oded Gabbay May 16, 2023, 9:30 a.m. UTC
If a workload gets stuck, we print an error about it to the kernel log.
Add the configured max timeout value to that print, as the value is
not fixed between ASICs and, in addition, can be configured using
a kernel module parameter.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

Comments

Ofir Bitton May 17, 2023, 6:01 p.m. UTC | #1
On 16/05/2023 12:30, Oded Gabbay wrote:
> If a workload got stuck, we print an error to the kernel log about it.
> Add to that print the configured max timeout value, as that value is
> not fixed between ASICs and in addition it can be configured using
> a kernel module parameter.
>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
> ---
>   .../habanalabs/common/command_submission.c    | 26 +++++++++++--------
>   1 file changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
> index ccf68f482948..4ec28af3ed78 100644
> --- a/drivers/accel/habanalabs/common/command_submission.c
> +++ b/drivers/accel/habanalabs/common/command_submission.c
> @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
>   
>   static void cs_timedout(struct work_struct *work)
>   {
> +	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
> +	bool skip_reset_on_timeout, device_reset = false;
>   	struct hl_device *hdev;
>   	u64 event_mask = 0x0;
> +	uint timeout_sec;
>   	int rc;
> -	struct hl_cs *cs = container_of(work, struct hl_cs,
> -						 work_tdr.work);
> -	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
> +
> +	skip_reset_on_timeout = cs->skip_reset_on_timeout;
>   
>   	rc = cs_get_unless_zero(cs);
>   	if (!rc)
> @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
>   		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
>   	}
>   
> +	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
> +
>   	switch (cs->type) {
>   	case CS_TYPE_SIGNAL:
>   		dev_err(hdev->dev,
> -			"Signal command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Signal command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	case CS_TYPE_WAIT:
>   		dev_err(hdev->dev,
> -			"Wait command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Wait command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	case CS_TYPE_COLLECTIVE_WAIT:
>   		dev_err(hdev->dev,
> -			"Collective Wait command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Collective Wait command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	default:
>   		dev_err(hdev->dev,
> -			"Command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   	}
>   

Reviewed-by: Ofir Bitton <obitton@habana.ai>
diff mbox series

Patch

diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
index ccf68f482948..4ec28af3ed78 100644
--- a/drivers/accel/habanalabs/common/command_submission.c
+++ b/drivers/accel/habanalabs/common/command_submission.c
@@ -804,12 +804,14 @@  static void cs_do_release(struct kref *ref)
 
 static void cs_timedout(struct work_struct *work)
 {
+	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
+	bool skip_reset_on_timeout, device_reset = false;
 	struct hl_device *hdev;
 	u64 event_mask = 0x0;
+	uint timeout_sec;
 	int rc;
-	struct hl_cs *cs = container_of(work, struct hl_cs,
-						 work_tdr.work);
-	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
+
+	skip_reset_on_timeout = cs->skip_reset_on_timeout;
 
 	rc = cs_get_unless_zero(cs);
 	if (!rc)
@@ -840,29 +842,31 @@  static void cs_timedout(struct work_struct *work)
 		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
 	}
 
+	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
+
 	switch (cs->type) {
 	case CS_TYPE_SIGNAL:
 		dev_err(hdev->dev,
-			"Signal command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Signal command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 
 	case CS_TYPE_WAIT:
 		dev_err(hdev->dev,
-			"Wait command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Wait command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 
 	case CS_TYPE_COLLECTIVE_WAIT:
 		dev_err(hdev->dev,
-			"Collective Wait command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Collective Wait command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 
 	default:
 		dev_err(hdev->dev,
-			"Command submission %llu has not finished in time!\n",
-			cs->sequence);
+			"Command submission %llu has not finished in %u seconds!\n",
+			cs->sequence, timeout_sec);
 		break;
 	}