
[1/3] drm/i915: Use consistent forcewake auto-release timeout across kernel configs

Message ID 1459788671-17501-1-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Tvrtko Ursulin April 4, 2016, 4:51 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Current implementation releases the forcewake at any time between
straight away, and one jiffie from the last put, or first automatic
grab.

This does not sound like what was desired since jiffies are typically
between 1 and 10ms depending on the kernel configuration.

Change the auto-release mechanism to use hrtimers and set the timeout
to 1ms with a 1ms of slack. This should make the GPU power consistent
across kernel configs and the slack should enable some timer coallescing
where multiple force-wake domains exist, or with unrelated timers.

For GlBench/T-Rex this decreases the number of forcewake releases from
~480 to ~300 per second, and for a heavy combined OGL/OCL test from
~670 to ~360.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  2 +-
 drivers/gpu/drm/i915/intel_uncore.c | 25 ++++++++++++++++---------
 2 files changed, 17 insertions(+), 10 deletions(-)

Comments

Chris Wilson April 4, 2016, 6:58 p.m. UTC | #1
On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Current implementation releases the forcewake at any time between
> straight away, and one jiffie from the last put, or first automatic
> grab.

That isn't the problem though. The problem is that we set the timer on
first use rather than last use. All you are stating here is that by
lengthening the timeout on your system you reduce the number of times it
times out.

Having said that, the conversion to hrtimer seems sensible, but to add
tracking of the last access as opposed to the first we either fall back to
jiffies (in which case hrtimer is moot) or rely on ktime_get_raw() being
fast enough for every register write. Hmm, my usual response to that has
been if it matters we avoid the heavyweight macros and use the _FW
interface - or even raw readl/writel.

Could you try storing ktime_get_raw() on every access and rearming the
timer if it expires before last-access + min-period?
-Chris
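
A minimal sketch of the rearm-on-last-access idea Chris suggests above, for
illustration only: the fw_domain_sketch struct, the last_access field and the
helper name are hypothetical and not part of the posted patch. It assumes
every register access stores ktime_get_raw() into last_access, and the expiry
callback defers itself if that access was less than the minimum period ago.

#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

/* Hypothetical domain with only the fields the sketch needs. */
struct fw_domain_sketch {
	struct hrtimer timer;
	ktime_t last_access;	/* written with ktime_get_raw() on each mmio access */
};

static enum hrtimer_restart fw_release_timer_sketch(struct hrtimer *timer)
{
	struct fw_domain_sketch *d =
		container_of(timer, struct fw_domain_sketch, timer);
	ktime_t now = ktime_get_raw();
	ktime_t deadline = ktime_add_ns(d->last_access, NSEC_PER_MSEC);

	if (ktime_before(now, deadline)) {
		/* Accessed too recently: push the expiry out by the remainder. */
		hrtimer_forward_now(timer, ktime_sub(deadline, now));
		return HRTIMER_RESTART;
	}

	/* ... release forcewake here, as intel_uncore_fw_release_timer() does ... */
	return HRTIMER_NORESTART;
}

Deferring inside the callback keeps the per-access cost down to a single ktime
store instead of rearming the hrtimer on every register write.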
Dave Gordon April 4, 2016, 6:58 p.m. UTC | #2
On 04/04/16 17:51, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Current implementation releases the forcewake at any time between
> straight away, and one jiffie from the last put, or first automatic
> grab.
>
> This does not sound like what was desired since jiffies are typically
> between 1 and 10ms depending on the kernel configuration.
>
> Change the auto-release mechanism to use hrtimers and set the timeout
> to 1ms with a 1ms of slack. This should make the GPU power consistent
> across kernel configs and the slack should enable some timer coallescing

"coalescing"

> where multiple force-wake domains exist, or with unrelated timers.
>
> For GlBench/T-Rex this decreases the number of forcewake releases from
> ~480 to ~300 per second, and for a heavy combined OGL/OCL test from
> ~670 to ~360.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

LGTM.

Reviewed-by: Dave Gordon <david.s.gordon@intel.com>

> ---
>   drivers/gpu/drm/i915/i915_drv.h     |  2 +-
>   drivers/gpu/drm/i915/intel_uncore.c | 25 ++++++++++++++++---------
>   2 files changed, 17 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index dd187727c813..7d4c704d7d75 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -666,7 +666,7 @@ struct intel_uncore {
>   		struct drm_i915_private *i915;
>   		enum forcewake_domain_id id;
>   		unsigned wake_count;
> -		struct timer_list timer;
> +		struct hrtimer timer;
>   		i915_reg_t reg_set;
>   		u32 val_set;
>   		u32 val_clear;
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index ac1c545436af..76ac990de354 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -60,7 +60,11 @@ fw_domain_reset(const struct intel_uncore_forcewake_domain *d)
>   static inline void
>   fw_domain_arm_timer(struct intel_uncore_forcewake_domain *d)
>   {
> -	mod_timer_pinned(&d->timer, jiffies + 1);
> +	d->wake_count++;
> +	hrtimer_start_range_ns(&d->timer,
> +			       ktime_set(0, NSEC_PER_MSEC),
> +			       NSEC_PER_MSEC,
> +			       HRTIMER_MODE_REL);
>   }
>
>   static inline void
> @@ -224,9 +228,11 @@ static int __gen6_gt_wait_for_fifo(struct drm_i915_private *dev_priv)
>   	return ret;
>   }
>
> -static void intel_uncore_fw_release_timer(unsigned long arg)
> +static enum hrtimer_restart
> +intel_uncore_fw_release_timer(struct hrtimer *timer)
>   {
> -	struct intel_uncore_forcewake_domain *domain = (void *)arg;
> +	struct intel_uncore_forcewake_domain *domain =
> +	       container_of(timer, struct intel_uncore_forcewake_domain, timer);
>   	unsigned long irqflags;
>
>   	assert_rpm_device_not_suspended(domain->i915);
> @@ -240,6 +246,8 @@ static void intel_uncore_fw_release_timer(unsigned long arg)
>   							  1 << domain->id);
>
>   	spin_unlock_irqrestore(&domain->i915->uncore.lock, irqflags);
> +
> +	return HRTIMER_NORESTART;
>   }
>
>   void intel_uncore_forcewake_reset(struct drm_device *dev, bool restore)
> @@ -259,16 +267,16 @@ void intel_uncore_forcewake_reset(struct drm_device *dev, bool restore)
>   		active_domains = 0;
>
>   		for_each_fw_domain(domain, dev_priv, id) {
> -			if (del_timer_sync(&domain->timer) == 0)
> +			if (hrtimer_cancel(&domain->timer) == 0)
>   				continue;
>
> -			intel_uncore_fw_release_timer((unsigned long)domain);
> +			intel_uncore_fw_release_timer(&domain->timer);
>   		}
>
>   		spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
>
>   		for_each_fw_domain(domain, dev_priv, id) {
> -			if (timer_pending(&domain->timer))
> +			if (hrtimer_active(&domain->timer))
>   				active_domains |= (1 << id);
>   		}
>
> @@ -491,7 +499,6 @@ static void __intel_uncore_forcewake_put(struct drm_i915_private *dev_priv,
>   		if (--domain->wake_count)
>   			continue;
>
> -		domain->wake_count++;
>   		fw_domain_arm_timer(domain);
>   	}
>   }
> @@ -732,7 +739,6 @@ static inline void __force_wake_auto(struct drm_i915_private *dev_priv,
>   			continue;
>   		}
>
> -		domain->wake_count++;
>   		fw_domain_arm_timer(domain);
>   	}
>
> @@ -1150,7 +1156,8 @@ static void fw_domain_init(struct drm_i915_private *dev_priv,
>   	d->i915 = dev_priv;
>   	d->id = domain_id;
>
> -	setup_timer(&d->timer, intel_uncore_fw_release_timer, (unsigned long)d);
> +	hrtimer_init(&d->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	d->timer.function = intel_uncore_fw_release_timer;
>
>   	dev_priv->uncore.fw_domains |= (1 << domain_id);
>
>
Dave Gordon April 4, 2016, 7:41 p.m. UTC | #3
On 04/04/16 19:58, Chris Wilson wrote:
> On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Current implementation releases the forcewake at any time between
>> straight away, and one jiffie from the last put, or first automatic
>> grab.
>
> That isn't the problem though. The problem is that we set the timer on
> first use rather than last use. All you are stating here is that by
> lengthening the timeout on your system you reduce the number of times it
> times out.

I thought that was already done, per my comments on one of your recent 
patches (rename __force_wake_get to __force_wake_auto) and your reply 
(gains are probably worth extra complexity). But of course Tvrtko wasn't 
here the last few days so may not have seen that. I suppose it would 
make sense to fold both changes together.

But changing the timeout to 1ms makes it generally *shorter* than 
before, not longer. The gain can't just be from having a longer timeout, 
'cos it isn't. (Average timeout now 1.5ms vs. previous 8ms?)

I think it should come from the fact that accesses are likely to be 
bursty, having a bimodal distribution in the time domain. When we've 
made one access, we're likely to make another much sooner than 1ms 
later, but once we stop making accesses there will probably be far more 
than 1ms before the next set of accesses.

> Having said that, the conversion to hrtimer seems sensible but to add
> tracking of the last access as opposed to first we either fallback to
> jiffie (in which case hrtimer is moot) or rely on ktime_get_raw() being
> fast enough for every register write. Hmm, my usual response to that has
> been if it matters we avoid the heavyweight macros and use the _FW
> interface - or even raw readl/writel.
>
> Could you try storing ktime_get_raw() on every access and rearming the
> timer if it expires before last-access + min-period?
> -Chris

One more idea - do we want two different expiry periods depending on 
whether the caller used the _auto version or not?

If they did, they're probably anticipating "only a few" accesses in the 
current operation, but perhaps with no idea about when the next set of 
accesses will occur.

If not, then the final decrement means they think they've finished for a 
while, and perhaps don't expect to come back for quite a long time. In 
that situation the timeout only helps avoid rapid off/on cycles by 
unrelated functions, not between logically-related operations.

So would two different timeouts make sense? Or even ...

_auto: longish timeout from the *first* _get() - don't rearm thereafter
other: shortish timeout from the _put() - non-auto _get() will cancel

or is that just too complicated?

.Dave.
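
A rough sketch of the two-timeout idea above, for illustration only: the
constants, their values and the helper name are hypothetical, and nothing like
this is in the posted patch. The auto path arms a longer timeout on the first
get and is not rearmed thereafter; the explicit-put path arms a shorter one.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/* Illustrative expiry periods for the two cases discussed above. */
#define FW_AUTO_TIMEOUT_NS	(5 * NSEC_PER_MSEC)	/* longish, from the first auto get */
#define FW_PUT_TIMEOUT_NS	(1 * NSEC_PER_MSEC)	/* shortish, from an explicit put */

static void fw_domain_arm_timer_sketch(struct hrtimer *timer, bool from_auto)
{
	/* Auto path: keep the deadline set by the first get, don't rearm. */
	if (from_auto && hrtimer_active(timer))
		return;

	hrtimer_start_range_ns(timer,
			       ns_to_ktime(from_auto ? FW_AUTO_TIMEOUT_NS :
						       FW_PUT_TIMEOUT_NS),
			       NSEC_PER_MSEC, HRTIMER_MODE_REL);
}

The "non-auto _get() will cancel" part of the proposal would simply call
hrtimer_cancel() on the pending timer when an explicit get arrives.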
Chris Wilson April 4, 2016, 8:22 p.m. UTC | #4
On Mon, Apr 04, 2016 at 08:41:20PM +0100, Dave Gordon wrote:
> On 04/04/16 19:58, Chris Wilson wrote:
> >On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>Current implementation releases the forcewake at any time between
> >>straight away, and one jiffie from the last put, or first automatic
> >>grab.
> >
> >That isn't the problem though. The problem is that we set the timer on
> >first use rather than last use. All you are stating here is that by
> >lengthening the timeout on your system you reduce the number of times it
> >times out.
> 
> I thought that was already done, per my comments on one of your
> recent patches (rename __force_wake_get to __force_wake_auto) and
> your reply (gains are probably worth extra complexity). But of
> course Tvrtko wasn't here the last few days so may not have seen
> that. I suppose it would make sense to fold both changes together.

We were talking about the cost of having 2 passes vs 1 to do the same
task. Before Tvrtko left, I realised again that we didn't defer the
timer itself for every access; rearming a timer on every register access
seems costly (more so switching to hrtimer). Rearming the timer inside
the timer callback seems a reasonable compromise.
 
> But changing the timeout to 1ms makes it generally *shorter* than
> before, not longer. The gain can't just be from having a longer
> timeout, 'cos it isn't. (Average timeout now 1.5ms vs. previous
> 8ms?)

It is. jiffies+1 does not equal an 8ms delay (it would be 1ms on his
machine anyway, 10ms on mine). The issue is that +1 jiffie is up to 1
jiffie and often close to 0, and that very well explains the wakeups
Tvrtko measured.

> I think it should come from the fact that accesses are likely to be
> bursty, having a bimodal distribution in the time domain. When we've
> made one access, we're likely to make another much sooner than 1ms
> later, but once we stop making accesses there will probably be far
> more than 1ms before the next set of accesses.

Yes. We expect temporal coherence in the access pattern. That's assuredly
so since we very rarely access one register by itself.

> One more idea - do we want two different expiry periods depending on
> whether the caller used the _auto version or not?

Hard to tell. Broadly speaking we have two users of the non-auto
interface: userspace disabling forcewake indefinitely, and the other is
code that wants to hold the spinlock + forcewake over a bunch of
register writes.
 
> If they did, they're probably anticipating "only a few" accesses in
> the current operation, but perhaps with no idea about when the next
> set of accesses will occur.
> 
> If not, then the final decrement means they think they've finished
> for a while, and perhaps don't expect to come back for quite a long
> time. In that situation the timeout only helps avoid rapid off/on
> cycles by unrelated functions, not between logically-related
> operations.

Execlists. (Wants to hold forcewake over a bunch of reads and writes,
and context switches may be very frequent or very slow.)
 
> So would two different timeouts make sense? Or even ...
> 
> _auto: longish timeout from the *first* _get() - don't rearm thereafter
> other: shortish timeout from the _put() - non-auto _get() will cancel
> 
> or is that just too complicated?

The challenge is that I think we should be pushing for shorter timeouts,
because of the bimodality in many accesses followed by a long period of
sleep. One of the original reasons for the choice of 1 jiffie with a
pinned timer was that it should mean that on the next call to schedule()
we should be deasserting forcewake (i.e. we only hold forcewake for this
sequence of register accesses). However, execlists scuppers that by
having an interrupt context that requires forcewake, and frequently so.

So I think we do need a combination of something like task_work_run() to
tie a forcewake reference to the active task with something like an
extra timer to allow frequent wakers to avoid having to retake
forcewake.
-Chris
Tvrtko Ursulin April 5, 2016, 8:54 a.m. UTC | #5
On 04/04/16 19:58, Chris Wilson wrote:
> On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Current implementation releases the forcewake at any time between
>> straight away, and one jiffie from the last put, or first automatic
>> grab.
>
> That isn't the problem though. The problem is that we set the timer on
> first use rather than last use. All you are stating here is that by
> lengthening the timeout on your system you reduce the number of times it
> times out.

It is true the reduction I see is due to the lengthening of the average timeout.

But with regards to the re-arming approach, I thought so initially myself,
but then, if we expect bursty access it shouldn't matter, and the
simplicity of doing it the way it currently is done is better.

Even in practice, I noticed the effect of lengthening the timeout is
much greater than moving the arming to the last access. And to get to
very few to no auto-releases on busy workloads we need timeouts in the
region of 5ms, which would be a big change. Or maybe not if you consider
HZ=100 kernels.

It is very difficult to know what is actually better when weighing power
between the CPU and GPU against performance. So I thought the best would be
to keep it similar to the current timings, just fix the dependency on HZ;
the slack with hrtimers might also help with something.

As a final data point, explicit puts and auto releases seem to be
relatively balanced in my testing. With this patch T-Rex for example 
auto-releases in the region of 3-4 times in 10ms, with around 5-10 
explicit gets, and 5-10 implicit gets in 10ms.

A different, interrupt-heavy workload (~20k irqs/sec) similarly manages
2-4 auto-releases per 10ms, and has ~250 explicit gets and ~380 implicit 
per 10ms.

Looking at the two, I think the fact they manage to auto-release
relatively similarly, compared to a huge difference in fw gets, suggests
burstiness. So I am not sure that any smarts with the release period
would be interesting. At least not without serious power/perf measurements.

> Having said that, the conversion to hrtimer seems sensible but to add
> tracking of the last access as opposed to first we either fallback to
> jiffie (in which case hrtimer is moot) or rely on ktime_get_raw() being
> fast enough for every register write. Hmm, my usual response to that has
> been if it matters we avoid the heavyweight macros and use the _FW
> interface - or even raw readl/writel.
>
> Could you try storing ktime_get_raw() on every access and rearming the
> timer if it expires before last-access + min-period?

That would be similar to your patch from before my holiday, right? As I 
said above, I did not notice much change with that approach. Just 
extending the timeout has a much greater effect, but as I wrote above, I 
am not sure we want to change it.

Regards,

Tvrtko
Chris Wilson April 5, 2016, 8:59 a.m. UTC | #6
On Tue, Apr 05, 2016 at 09:54:58AM +0100, Tvrtko Ursulin wrote:
> 
> On 04/04/16 19:58, Chris Wilson wrote:
> >On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>Current implementation releases the forcewake at any time between
> >>straight away, and one jiffie from the last put, or first automatic
> >>grab.
> >
> >That isn't the problem though. The problem is that we set the timer on
> >first use rather than last use. All you are stating here is that by
> >lengthening the timeout on your system you reduce the number of times it
> >times out.
> 
> It is true the reduction I see is due lengthening of the average timeout.
> 
> But with regards to re-arming approach, I thought so initially
> myself, but then, if we expect bursty access then it shouldn't
> matter and the simplicity of doing it like it currently is better.
> 
> Even in practice, I noticed the effect of lengthening the timeout is
> much greater than moving the arming to the last access. And to get
> to very few to none auto releases on busy workloads we need in the
> regions of 5ms, which would be a big change. Or maybe not if you
> consider HZ=100 kernels.
> 
> It is very difficult to know what is actually better considering
> power between the CPU and GPU and performance. So I though the best
> would be to keep it similar to the current timings, just fix the
> dependency on HZ and also slack with hrtimers might help with
> something.
> 
> As a final data point, explicit puts and auto releases seems to be
> relatively balanced in my testing. With this patch T-Rex for example
> auto-releases in the region of 3-4 times in 10ms, with around 5-10
> explicit gets, and 5-10 implicit gets in 10ms.
> 
> A different, interrupt heavy workload (~20k irqs/sec) manages
> similarly 2-4 auto-releases per 10ms, and has ~250 explicit gets and
> ~380 implicit per 10ms.
> 
> Looking at the two I think the fact they manage to auto-release
> relatively similarly, compared to a huge different in fw gets,
> suggest burstyness. So I am not sure that any smarts with the
> release period would be interesting. At least not without serious
> power/perf measurements.
> 
> >Having said that, the conversion to hrtimer seems sensible but to add
> >tracking of the last access as opposed to first we either fallback to
> >jiffie (in which case hrtimer is moot) or rely on ktime_get_raw() being
> >fast enough for every register write. Hmm, my usual response to that has
> >been if it matters we avoid the heavyweight macros and use the _FW
> >interface - or even raw readl/writel.
> >
> >Could you try storing ktime_get_raw() on every access and rearming the
> >timer if it expires before last-access + min-period?
> 
> That would be similar to your patch from before my holiday, right?
> As I said above, I did not notice much change with that approach.
> Just extending the timeout has a much greater effect, but as I wrote
> above, I am not sure we want to change it.

There was just one bug in that patch in checking the expiration that
makes a huge difference, but if you could please add the discussion above
to the commit log, that would be invaluable.
-Chris
Tvrtko Ursulin April 5, 2016, 10:02 a.m. UTC | #7
On 05/04/16 09:59, Chris Wilson wrote:
> On Tue, Apr 05, 2016 at 09:54:58AM +0100, Tvrtko Ursulin wrote:
>>
>> On 04/04/16 19:58, Chris Wilson wrote:
>>> On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> Current implementation releases the forcewake at any time between
>>>> straight away, and one jiffie from the last put, or first automatic
>>>> grab.
>>>
>>> That isn't the problem though. The problem is that we set the timer on
>>> first use rather than last use. All you are stating here is that by
>>> lengthening the timeout on your system you reduce the number of times it
>>> times out.
>>
>> It is true the reduction I see is due lengthening of the average timeout.
>>
>> But with regards to re-arming approach, I thought so initially
>> myself, but then, if we expect bursty access then it shouldn't
>> matter and the simplicity of doing it like it currently is better.
>>
>> Even in practice, I noticed the effect of lengthening the timeout is
>> much greater than moving the arming to the last access. And to get
>> to very few to none auto releases on busy workloads we need in the
>> regions of 5ms, which would be a big change. Or maybe not if you
>> consider HZ=100 kernels.
>>
>> It is very difficult to know what is actually better considering
>> power between the CPU and GPU and performance. So I though the best
>> would be to keep it similar to the current timings, just fix the
>> dependency on HZ and also slack with hrtimers might help with
>> something.
>>
>> As a final data point, explicit puts and auto releases seems to be
>> relatively balanced in my testing. With this patch T-Rex for example
>> auto-releases in the region of 3-4 times in 10ms, with around 5-10
>> explicit gets, and 5-10 implicit gets in 10ms.
>>
>> A different, interrupt heavy workload (~20k irqs/sec) manages
>> similarly 2-4 auto-releases per 10ms, and has ~250 explicit gets and
>> ~380 implicit per 10ms.
>>
>> Looking at the two I think the fact they manage to auto-release
>> relatively similarly, compared to a huge different in fw gets,
>> suggest burstyness. So I am not sure that any smarts with the
>> release period would be interesting. At least not without serious
>> power/perf measurements.
>>
>>> Having said that, the conversion to hrtimer seems sensible but to add
>>> tracking of the last access as opposed to first we either fallback to
>>> jiffie (in which case hrtimer is moot) or rely on ktime_get_raw() being
>>> fast enough for every register write. Hmm, my usual response to that has
>>> been if it matters we avoid the heavyweight macros and use the _FW
>>> interface - or even raw readl/writel.
>>>
>>> Could you try storing ktime_get_raw() on every access and rearming the
>>> timer if it expires before last-access + min-period?
>>
>> That would be similar to your patch from before my holiday, right?
>> As I said above, I did not notice much change with that approach.
>> Just extending the timeout has a much greater effect, but as I wrote
>> above, I am not sure we want to change it.
>
> There was just one bug in that patch in checking the expiration that
> makes a huge difference, but if you please add the discussion above to
> the commit log that would be invaluable.

I think I had a fixed version, would have to dig in branches now.

I will add more text to the commit.

Most importantly, do you think this would affect HZ=100 or HZ=250 kernels
at all? Presumably if it could, someone would already have noticed a
difference between kernels based on HZ?

Regards,

Tvrtko
Chris Wilson April 5, 2016, 10:44 a.m. UTC | #8
On Tue, Apr 05, 2016 at 11:02:15AM +0100, Tvrtko Ursulin wrote:
> 
> On 05/04/16 09:59, Chris Wilson wrote:
> >On Tue, Apr 05, 2016 at 09:54:58AM +0100, Tvrtko Ursulin wrote:
> >>
> >>On 04/04/16 19:58, Chris Wilson wrote:
> >>>On Mon, Apr 04, 2016 at 05:51:09PM +0100, Tvrtko Ursulin wrote:
> >>>>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>>
> >>>>Current implementation releases the forcewake at any time between
> >>>>straight away, and one jiffie from the last put, or first automatic
> >>>>grab.
> >>>
> >>>That isn't the problem though. The problem is that we set the timer on
> >>>first use rather than last use. All you are stating here is that by
> >>>lengthening the timeout on your system you reduce the number of times it
> >>>times out.
> >>
> >>It is true the reduction I see is due lengthening of the average timeout.
> >>
> >>But with regards to re-arming approach, I thought so initially
> >>myself, but then, if we expect bursty access then it shouldn't
> >>matter and the simplicity of doing it like it currently is better.
> >>
> >>Even in practice, I noticed the effect of lengthening the timeout is
> >>much greater than moving the arming to the last access. And to get
> >>to very few to none auto releases on busy workloads we need in the
> >>regions of 5ms, which would be a big change. Or maybe not if you
> >>consider HZ=100 kernels.
> >>
> >>It is very difficult to know what is actually better considering
> >>power between the CPU and GPU and performance. So I though the best
> >>would be to keep it similar to the current timings, just fix the
> >>dependency on HZ and also slack with hrtimers might help with
> >>something.
> >>
> >>As a final data point, explicit puts and auto releases seems to be
> >>relatively balanced in my testing. With this patch T-Rex for example
> >>auto-releases in the region of 3-4 times in 10ms, with around 5-10
> >>explicit gets, and 5-10 implicit gets in 10ms.
> >>
> >>A different, interrupt heavy workload (~20k irqs/sec) manages
> >>similarly 2-4 auto-releases per 10ms, and has ~250 explicit gets and
> >>~380 implicit per 10ms.
> >>
> >>Looking at the two I think the fact they manage to auto-release
> >>relatively similarly, compared to a huge different in fw gets,
> >>suggest burstyness. So I am not sure that any smarts with the
> >>release period would be interesting. At least not without serious
> >>power/perf measurements.
> >>
> >>>Having said that, the conversion to hrtimer seems sensible but to add
> >>>tracking of the last access as opposed to first we either fallback to
> >>>jiffie (in which case hrtimer is moot) or rely on ktime_get_raw() being
> >>>fast enough for every register write. Hmm, my usual response to that has
> >>>been if it matters we avoid the heavyweight macros and use the _FW
> >>>interface - or even raw readl/writel.
> >>>
> >>>Could you try storing ktime_get_raw() on every access and rearming the
> >>>timer if it expires before last-access + min-period?
> >>
> >>That would be similar to your patch from before my holiday, right?
> >>As I said above, I did not notice much change with that approach.
> >>Just extending the timeout has a much greater effect, but as I wrote
> >>above, I am not sure we want to change it.
> >
> >There was just one bug in that patch in checking the expiration that
> >makes a huge difference, but if you please add the discussion above to
> >the commit log that would be invaluable.
> 
> I think I had a fixed version, would have to dig in branches now.
> 
> I will add more text to the commit.
> 
> Most importantly, do you think this would affect HZ=10 or HZ=250
> kernels at all? Presumably if it could, someone would know today
> that there is a difference between the kernels based on HZ already?

Hmm, I switch quite regularly - that may explain why sometimes I see
fw_domains_get quite heavy in the profiles and other times not. But I have
never correlated that with changing .configs. And I don't keep stats for
rc6 during stress tests - even more notable is the lack of light desktop
load testing.

One interesting side-effect of hrtimer is that iirc it is installed on
the local cpu and isn't run until we hit a preemption point (i.e. the
scheduler in my case), so those who are running high-latency, low-HZ
kernels are likely not affected at all!

As far as hrtimer is concerned, my only real worry is that it might be too
expensive - but since it is being increasingly widely used I shouldn't
worry myself too much about that (if it were to be a problem, it would
get fixed!).
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index dd187727c813..7d4c704d7d75 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -666,7 +666,7 @@  struct intel_uncore {
 		struct drm_i915_private *i915;
 		enum forcewake_domain_id id;
 		unsigned wake_count;
-		struct timer_list timer;
+		struct hrtimer timer;
 		i915_reg_t reg_set;
 		u32 val_set;
 		u32 val_clear;
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index ac1c545436af..76ac990de354 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -60,7 +60,11 @@  fw_domain_reset(const struct intel_uncore_forcewake_domain *d)
 static inline void
 fw_domain_arm_timer(struct intel_uncore_forcewake_domain *d)
 {
-	mod_timer_pinned(&d->timer, jiffies + 1);
+	d->wake_count++;
+	hrtimer_start_range_ns(&d->timer,
+			       ktime_set(0, NSEC_PER_MSEC),
+			       NSEC_PER_MSEC,
+			       HRTIMER_MODE_REL);
 }
 
 static inline void
@@ -224,9 +228,11 @@  static int __gen6_gt_wait_for_fifo(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
-static void intel_uncore_fw_release_timer(unsigned long arg)
+static enum hrtimer_restart
+intel_uncore_fw_release_timer(struct hrtimer *timer)
 {
-	struct intel_uncore_forcewake_domain *domain = (void *)arg;
+	struct intel_uncore_forcewake_domain *domain =
+	       container_of(timer, struct intel_uncore_forcewake_domain, timer);
 	unsigned long irqflags;
 
 	assert_rpm_device_not_suspended(domain->i915);
@@ -240,6 +246,8 @@  static void intel_uncore_fw_release_timer(unsigned long arg)
 							  1 << domain->id);
 
 	spin_unlock_irqrestore(&domain->i915->uncore.lock, irqflags);
+
+	return HRTIMER_NORESTART;
 }
 
 void intel_uncore_forcewake_reset(struct drm_device *dev, bool restore)
@@ -259,16 +267,16 @@  void intel_uncore_forcewake_reset(struct drm_device *dev, bool restore)
 		active_domains = 0;
 
 		for_each_fw_domain(domain, dev_priv, id) {
-			if (del_timer_sync(&domain->timer) == 0)
+			if (hrtimer_cancel(&domain->timer) == 0)
 				continue;
 
-			intel_uncore_fw_release_timer((unsigned long)domain);
+			intel_uncore_fw_release_timer(&domain->timer);
 		}
 
 		spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
 
 		for_each_fw_domain(domain, dev_priv, id) {
-			if (timer_pending(&domain->timer))
+			if (hrtimer_active(&domain->timer))
 				active_domains |= (1 << id);
 		}
 
@@ -491,7 +499,6 @@  static void __intel_uncore_forcewake_put(struct drm_i915_private *dev_priv,
 		if (--domain->wake_count)
 			continue;
 
-		domain->wake_count++;
 		fw_domain_arm_timer(domain);
 	}
 }
@@ -732,7 +739,6 @@  static inline void __force_wake_auto(struct drm_i915_private *dev_priv,
 			continue;
 		}
 
-		domain->wake_count++;
 		fw_domain_arm_timer(domain);
 	}
 
@@ -1150,7 +1156,8 @@  static void fw_domain_init(struct drm_i915_private *dev_priv,
 	d->i915 = dev_priv;
 	d->id = domain_id;
 
-	setup_timer(&d->timer, intel_uncore_fw_release_timer, (unsigned long)d);
+	hrtimer_init(&d->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	d->timer.function = intel_uncore_fw_release_timer;
 
 	dev_priv->uncore.fw_domains |= (1 << domain_id);