Message ID | 20230516093030.1220526-5-ogabbay@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/12] accel/habanalabs: rename security functions related arguments | expand |
On 16/05/2023 12:30, Oded Gabbay wrote: > If a workload got stuck, we print an error to the kernel log about it. > Add to that print the configured max timeout value, as that value is > not fixed between ASICs and in addition it can be configured using > a kernel module parameter. > > Signed-off-by: Oded Gabbay <ogabbay@kernel.org> > --- > .../habanalabs/common/command_submission.c | 26 +++++++++++-------- > 1 file changed, 15 insertions(+), 11 deletions(-) > > diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c > index ccf68f482948..4ec28af3ed78 100644 > --- a/drivers/accel/habanalabs/common/command_submission.c > +++ b/drivers/accel/habanalabs/common/command_submission.c > @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref) > > static void cs_timedout(struct work_struct *work) > { > + struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); > + bool skip_reset_on_timeout, device_reset = false; > struct hl_device *hdev; > u64 event_mask = 0x0; > + uint timeout_sec; > int rc; > - struct hl_cs *cs = container_of(work, struct hl_cs, > - work_tdr.work); > - bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false; > + > + skip_reset_on_timeout = cs->skip_reset_on_timeout; > > rc = cs_get_unless_zero(cs); > if (!rc) > @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work) > event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; > } > > + timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000; > + > switch (cs->type) { > case CS_TYPE_SIGNAL: > dev_err(hdev->dev, > - "Signal command submission %llu has not finished in time!\n", > - cs->sequence); > + "Signal command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > > case CS_TYPE_WAIT: > dev_err(hdev->dev, > - "Wait command submission %llu has not finished in time!\n", > - cs->sequence); > + "Wait command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > > case CS_TYPE_COLLECTIVE_WAIT: > dev_err(hdev->dev, > - "Collective Wait command submission %llu has not finished in time!\n", > - cs->sequence); > + "Collective Wait command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > > default: > dev_err(hdev->dev, > - "Command submission %llu has not finished in time!\n", > - cs->sequence); > + "Command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > } > Reviewed-by: Ofir Bitton <obitton@habana.ai>
diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index ccf68f482948..4ec28af3ed78 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref) static void cs_timedout(struct work_struct *work) { + struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); + bool skip_reset_on_timeout, device_reset = false; struct hl_device *hdev; u64 event_mask = 0x0; + uint timeout_sec; int rc; - struct hl_cs *cs = container_of(work, struct hl_cs, - work_tdr.work); - bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false; + + skip_reset_on_timeout = cs->skip_reset_on_timeout; rc = cs_get_unless_zero(cs); if (!rc) @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work) event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; } + timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000; + switch (cs->type) { case CS_TYPE_SIGNAL: dev_err(hdev->dev, - "Signal command submission %llu has not finished in time!\n", - cs->sequence); + "Signal command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; case CS_TYPE_WAIT: dev_err(hdev->dev, - "Wait command submission %llu has not finished in time!\n", - cs->sequence); + "Wait command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; case CS_TYPE_COLLECTIVE_WAIT: dev_err(hdev->dev, - "Collective Wait command submission %llu has not finished in time!\n", - cs->sequence); + "Collective Wait command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; default: dev_err(hdev->dev, - "Command submission %llu has not finished in time!\n", - cs->sequence); + "Command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; }
If a workload got stuck, we print an error to the kernel log about it. Add to that print the configured max timeout value, as that value is not fixed between ASICs and in addition it can be configured using a kernel module parameter. Signed-off-by: Oded Gabbay <ogabbay@kernel.org> --- .../habanalabs/common/command_submission.c | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-)