[3/3] drm/i915/hwmon: Block waiting for GuC reset to complete

Message ID: 20230406044522.3108359-4-ashutosh.dixit@intel.com
State: New, archived
Series: drm/i915/guc: Disable PL1 power limit when loading GuC firmware

Commit Message

Dixit, Ashutosh April 6, 2023, 4:45 a.m. UTC
Instead of erroring out when a GuC reset is in progress, block waiting for
the GuC reset to complete, which is more reasonable uapi behavior.

Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
---
 drivers/gpu/drm/i915/i915_hwmon.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

Comments

Rodrigo Vivi April 7, 2023, 11:04 a.m. UTC | #1
On Wed, Apr 05, 2023 at 09:45:22PM -0700, Ashutosh Dixit wrote:
> Instead of erroring out when a GuC reset is in progress, block waiting for
> the GuC reset to complete, which is more reasonable uapi behavior.
> 
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_hwmon.c | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> index 9ab8971679fe3..4343efb48e61b 100644
> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> @@ -51,6 +51,7 @@ struct hwm_drvdata {
>  	char name[12];
>  	int gt_n;
>  	bool reset_in_progress;
> +	wait_queue_head_t wqh;
>  };
>  
>  struct i915_hwmon {
> @@ -400,10 +401,15 @@ hwm_power_max_write(struct hwm_drvdata *ddat, long val)
>  	int ret = 0;
>  	u32 nval;
>  
> +retry:
>  	mutex_lock(&hwmon->hwmon_lock);
>  	if (hwmon->ddat.reset_in_progress) {
> -		ret = -EAGAIN;
> -		goto unlock;
> +		mutex_unlock(&hwmon->hwmon_lock);
> +		ret = wait_event_interruptible(ddat->wqh,
> +					       !hwmon->ddat.reset_in_progress);

this is indeed very clever!
maybe just use the timeout version to be on the safe side and then return
-EAGAIN on timeout?
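
Something like this, perhaps (a rough, untested sketch; the 100 ms value
is just illustrative):

	long t;

	t = wait_event_interruptible_timeout(ddat->wqh,
					     !hwmon->ddat.reset_in_progress,
					     msecs_to_jiffies(100));
	if (!t)
		return -EAGAIN;		/* reset did not complete in time */
	if (t < 0)
		return t;		/* -ERESTARTSYS on signal */
	goto retry;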

my fear is probably due to my lack of knowledge of this wait queue, but
I'm wondering what could go wrong if, due to some funny race, you enter this
check right after the wake_up_all below has already run, and then you will be
here waiting indefinitely...

> +		if (ret)
> +			return ret;
> +		goto retry;
>  	}
>  	wakeref = intel_runtime_pm_get(ddat->uncore->rpm);
>  
> @@ -426,7 +432,6 @@ hwm_power_max_write(struct hwm_drvdata *ddat, long val)
>  			 PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, nval);
>  exit:
>  	intel_runtime_pm_put(ddat->uncore->rpm, wakeref);
> -unlock:
>  	mutex_unlock(&hwmon->hwmon_lock);
>  	return ret;
>  }
> @@ -508,6 +513,7 @@ void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
>  	intel_uncore_rmw(hwmon->ddat.uncore, hwmon->rg.pkg_rapl_limit,
>  			 PKG_PWR_LIM_1_EN, old ? PKG_PWR_LIM_1_EN : 0);
>  	hwmon->ddat.reset_in_progress = false;
> +	wake_up_all(&hwmon->ddat.wqh);
>  
>  	mutex_unlock(&hwmon->hwmon_lock);
>  }
> @@ -784,6 +790,7 @@ void i915_hwmon_register(struct drm_i915_private *i915)
>  	ddat->uncore = &i915->uncore;
>  	snprintf(ddat->name, sizeof(ddat->name), "i915");
>  	ddat->gt_n = -1;
> +	init_waitqueue_head(&ddat->wqh);
>  
>  	for_each_gt(gt, i915, i) {
>  		ddat_gt = hwmon->ddat_gt + i;
> -- 
> 2.38.0
>
Dixit, Ashutosh April 10, 2023, 10:40 p.m. UTC | #2
On Fri, 07 Apr 2023 04:04:06 -0700, Rodrigo Vivi wrote:
>

Hi Rodrigo,

> On Wed, Apr 05, 2023 at 09:45:22PM -0700, Ashutosh Dixit wrote:
> > Instead of erroring out when a GuC reset is in progress, block waiting for
> > the GuC reset to complete, which is more reasonable uapi behavior.
> >
> > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_hwmon.c | 13 ++++++++++---
> >  1 file changed, 10 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> > index 9ab8971679fe3..4343efb48e61b 100644
> > --- a/drivers/gpu/drm/i915/i915_hwmon.c
> > +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> > @@ -51,6 +51,7 @@ struct hwm_drvdata {
> >	char name[12];
> >	int gt_n;
> >	bool reset_in_progress;
> > +	wait_queue_head_t wqh;
> >  };
> >
> >  struct i915_hwmon {
> > @@ -400,10 +401,15 @@ hwm_power_max_write(struct hwm_drvdata *ddat, long val)
> >	int ret = 0;
> >	u32 nval;
> >
> > +retry:
> >	mutex_lock(&hwmon->hwmon_lock);
> >	if (hwmon->ddat.reset_in_progress) {
> > -		ret = -EAGAIN;
> > -		goto unlock;
> > +		mutex_unlock(&hwmon->hwmon_lock);
> > +		ret = wait_event_interruptible(ddat->wqh,
> > +					       !hwmon->ddat.reset_in_progress);
>
> this is indeed very clever!

Not clever, see below :/

> my fear is probably due to my lack of knowledge of this wait queue, but
> I'm wondering what could go wrong if, due to some funny race, you enter this
> check right after the wake_up_all below has already run, and then you will be
> here waiting indefinitely...

You are absolutely right, there is indeed a race in the patch: in the code
above, once we drop the mutex (mutex_unlock), the wake_up_all can run before
we have queued ourselves on the waitqueue for the wakeup.

Solving this race needs a more complicated prepare_to_wait/finish_wait
sequence, which I have gone ahead and implemented in the v2 patch. The v2
code follows a standard pattern; it is basically the same as the one in
intel_guc_wait_for_pending_msg() in i915, which I liked.

I have read in several places (e.g. the Advanced Sleeping section in
https://static.lwn.net/images/pdf/LDD3/ch06.pdf and the kernel documentation
for try_to_wake_up()) that this sequence avoids the race (between schedule()
and wake_up()). The crucial difference from the v1 patch is that in v2 the
mutex is dropped only after we have queued ourselves in prepare_to_wait(),
just before calling schedule_timeout().
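
For reference, a minimal untested sketch of the pattern (names taken from
this patch; the timeout value and exact error codes are illustrative, not
the actual v2 code):

	DEFINE_WAIT(wait);
	long timeout = msecs_to_jiffies(100);	/* illustrative */
	int ret = 0;

	for (;;) {
		mutex_lock(&hwmon->hwmon_lock);

		/*
		 * Queue ourselves before dropping the mutex, so a
		 * wake_up_all() racing with the unlock finds us already
		 * on the waitqueue and the wakeup cannot be lost.
		 */
		prepare_to_wait(&ddat->wqh, &wait, TASK_INTERRUPTIBLE);

		if (!hwmon->ddat.reset_in_progress)
			break;			/* proceed, mutex held */

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (!timeout) {
			ret = -EAGAIN;
			break;
		}

		mutex_unlock(&hwmon->hwmon_lock);
		timeout = schedule_timeout(timeout);
	}
	finish_wait(&ddat->wqh, &wait);

	if (ret) {
		mutex_unlock(&hwmon->hwmon_lock);
		return ret;
	}
	/* ... rest of hwm_power_max_write() runs with the mutex held ... */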

> maybe just use the timeout version to be on the safe side and then return
> -EAGAIN on timeout?

Also incorporated a timeout in the new version. All code paths in the new
patch have been tested.

Thanks.
--
Ashutosh

> > +		if (ret)
> > +			return ret;
> > +		goto retry;
> >	}
> >	wakeref = intel_runtime_pm_get(ddat->uncore->rpm);
> >
> > @@ -426,7 +432,6 @@ hwm_power_max_write(struct hwm_drvdata *ddat, long val)
> >			 PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, nval);
> >  exit:
> >	intel_runtime_pm_put(ddat->uncore->rpm, wakeref);
> > -unlock:
> >	mutex_unlock(&hwmon->hwmon_lock);
> >	return ret;
> >  }
> > @@ -508,6 +513,7 @@ void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
> >	intel_uncore_rmw(hwmon->ddat.uncore, hwmon->rg.pkg_rapl_limit,
> >			 PKG_PWR_LIM_1_EN, old ? PKG_PWR_LIM_1_EN : 0);
> >	hwmon->ddat.reset_in_progress = false;
> > +	wake_up_all(&hwmon->ddat.wqh);
> >
> >	mutex_unlock(&hwmon->hwmon_lock);
> >  }
> > @@ -784,6 +790,7 @@ void i915_hwmon_register(struct drm_i915_private *i915)
> >	ddat->uncore = &i915->uncore;
> >	snprintf(ddat->name, sizeof(ddat->name), "i915");
> >	ddat->gt_n = -1;
> > +	init_waitqueue_head(&ddat->wqh);
> >
> >	for_each_gt(gt, i915, i) {
> >		ddat_gt = hwmon->ddat_gt + i;
> > --
> > 2.38.0
> >

Patch

diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
index 9ab8971679fe3..4343efb48e61b 100644
--- a/drivers/gpu/drm/i915/i915_hwmon.c
+++ b/drivers/gpu/drm/i915/i915_hwmon.c
@@ -51,6 +51,7 @@ struct hwm_drvdata {
 	char name[12];
 	int gt_n;
 	bool reset_in_progress;
+	wait_queue_head_t wqh;
 };
 
 struct i915_hwmon {
@@ -400,10 +401,15 @@ hwm_power_max_write(struct hwm_drvdata *ddat, long val)
 	int ret = 0;
 	u32 nval;
 
+retry:
 	mutex_lock(&hwmon->hwmon_lock);
 	if (hwmon->ddat.reset_in_progress) {
-		ret = -EAGAIN;
-		goto unlock;
+		mutex_unlock(&hwmon->hwmon_lock);
+		ret = wait_event_interruptible(ddat->wqh,
+					       !hwmon->ddat.reset_in_progress);
+		if (ret)
+			return ret;
+		goto retry;
 	}
 	wakeref = intel_runtime_pm_get(ddat->uncore->rpm);
 
@@ -426,7 +432,6 @@ hwm_power_max_write(struct hwm_drvdata *ddat, long val)
 			 PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, nval);
 exit:
 	intel_runtime_pm_put(ddat->uncore->rpm, wakeref);
-unlock:
 	mutex_unlock(&hwmon->hwmon_lock);
 	return ret;
 }
@@ -508,6 +513,7 @@ void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
 	intel_uncore_rmw(hwmon->ddat.uncore, hwmon->rg.pkg_rapl_limit,
 			 PKG_PWR_LIM_1_EN, old ? PKG_PWR_LIM_1_EN : 0);
 	hwmon->ddat.reset_in_progress = false;
+	wake_up_all(&hwmon->ddat.wqh);
 
 	mutex_unlock(&hwmon->hwmon_lock);
 }
@@ -784,6 +790,7 @@ void i915_hwmon_register(struct drm_i915_private *i915)
 	ddat->uncore = &i915->uncore;
 	snprintf(ddat->name, sizeof(ddat->name), "i915");
 	ddat->gt_n = -1;
+	init_waitqueue_head(&ddat->wqh);
 
 	for_each_gt(gt, i915, i) {
 		ddat_gt = hwmon->ddat_gt + i;