diff mbox

sched: Support current clocksource handling in fallback sched_clock().

Message ID 20090526230855.GA27218@linux-sh.org (mailing list archive)
State Superseded
Headers show

Commit Message

Paul Mundt May 26, 2009, 11:08 p.m. UTC
On Tue, May 26, 2009 at 10:17:02PM +0200, Thomas Gleixner wrote:
> On Tue, 26 May 2009, Peter Zijlstra wrote:
> > On Tue, 2009-05-26 at 16:31 +0200, Linus Walleij wrote:
> > > The definition of "rating" from the kerneldoc does not
> > > seem to imply that, it's a subjective measure AFAICT.
> 
>   Right, there is no rating threshold defined, which allows to deduce
>   that. The TSC on x86 which might be unreliable, but usable as
>   sched_clock has an initial rating of 300 which can be changed later
>   on to 0 when the TSC is unusable as a time of day source. In that
>   case clock is replaced by HPET which has a rating > 100 but is
>   definitely not a good choice for sched_clock
> 
> > > Else you might want an additional criteria, like
> > > cyc2ns(1) (much less than) jiffies_to_usecs(1)*1000
> > > (however you do that the best way)
> > > so you don't pick something
> > > that isn't substantially faster than the jiffy counter at least?
> 
>   What we can do is add another flag to the clocksource e.g.
>   CLOCK_SOURCE_USE_FOR_SCHED_CLOCK and check this instead of the
>   rating.
> 
Ok, so based on this and John's locking concerns, how about something
like this? It doesn't handle the wrapping cases, but I wonder if we
really want to add that amount of logic to sched_clock() in the first
place. Clocksources that wrap frequently could either leave the flag
unset, or do something similar to the TSC code where the cyc2ns shift is
used. If this is something we want to handle generically, then I'll have
a go at generalizing the TSC cyc2ns scaling bits for the next spin.

---

 include/linux/clocksource.h |    2 ++
 kernel/sched_clock.c        |   22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

John Stultz May 26, 2009, 11:25 p.m. UTC | #1
On Wed, 2009-05-27 at 08:08 +0900, Paul Mundt wrote:
> On Tue, May 26, 2009 at 10:17:02PM +0200, Thomas Gleixner wrote:
> > On Tue, 26 May 2009, Peter Zijlstra wrote:
> > > On Tue, 2009-05-26 at 16:31 +0200, Linus Walleij wrote:
> > > > The definition of "rating" from the kerneldoc does not
> > > > seem to imply that, it's a subjective measure AFAICT.
> > 
> >   Right, there is no rating threshold defined, which allows to deduce
> >   that. The TSC on x86 which might be unreliable, but usable as
> >   sched_clock has an initial rating of 300 which can be changed later
> >   on to 0 when the TSC is unusable as a time of day source. In that
> >   case clock is replaced by HPET which has a rating > 100 but is
> >   definitely not a good choice for sched_clock
> > 
> > > > Else you might want an additional criteria, like
> > > > cyc2ns(1) (much less than) jiffies_to_usecs(1)*1000
> > > > (however you do that the best way)
> > > > so you don't pick something
> > > > that isn't substantially faster than the jiffy counter at least?
> > 
> >   What we can do is add another flag to the clocksource e.g.
> >   CLOCK_SOURCE_USE_FOR_SCHED_CLOCK and check this instead of the
> >   rating.
> > 
> Ok, so based on this and John's locking concerns, how about something
> like this? It doesn't handle the wrapping cases, but I wonder if we
> really want to add that amount of logic to sched_clock() in the first
> place. Clocksources that wrap frequently could either leave the flag
> unset, or do something similar to the TSC code where the cyc2ns shift is
> used. If this is something we want to handle generically, then I'll have
> a go at generalizing the TSC cyc2ns scaling bits for the next spin.


Yea. So this is a little better. There's still a few other issues to
consider:

1) What if a clocksource is registered that has the _SCHED_CLOCK bit
set, but is not selected for timekeeping due to it being unstable like the
TSC?

2) Conditionally returning jiffies if the lock is held seems troubling.
Might get some crazy values that way.

thanks
-john


> ---
> 
>  include/linux/clocksource.h |    2 ++
>  kernel/sched_clock.c        |   22 ++++++++++++++++++++++
>  2 files changed, 24 insertions(+)
> 
> diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
> index c56457c..cfd873e 100644
> --- a/include/linux/clocksource.h
> +++ b/include/linux/clocksource.h
> @@ -203,6 +203,7 @@ struct clocksource {
>  };
> 
>  extern struct clocksource *clock;	/* current clocksource */
> +extern spinlock_t clocksource_lock;
> 
>  /*
>   * Clock source flags bits::
> @@ -212,6 +213,7 @@ extern struct clocksource *clock;	/* current clocksource */
> 
>  #define CLOCK_SOURCE_WATCHDOG			0x10
>  #define CLOCK_SOURCE_VALID_FOR_HRES		0x20
> +#define CLOCK_SOURCE_USE_FOR_SCHED_CLOCK	0x40
> 
>  /* simplify initialization of mask field */
>  #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
> diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
> index e1d16c9..c7027cd 100644
> --- a/kernel/sched_clock.c
> +++ b/kernel/sched_clock.c
> @@ -30,6 +30,7 @@
>  #include <linux/percpu.h>
>  #include <linux/ktime.h>
>  #include <linux/sched.h>
> +#include <linux/clocksource.h>
> 
>  /*
>   * Scheduler clock - returns current time in nanosec units.
> @@ -38,6 +39,27 @@
>   */
>  unsigned long long __attribute__((weak)) sched_clock(void)
>  {
> +	/*
> +	 * Use the current clocksource when it becomes available later in
> +	 * the boot process. As this needs to be fast, we only make a
> +	 * single pass at grabbing the spinlock. If the clock is changing
> +	 * out from underneath us, fall back on jiffies and try it again
> +	 * the next time around.
> +	 */
> +	if (clock && _raw_spin_trylock(&clocksource_lock)) {
> +		/*
> +		 * Only use clocksources suitable for sched_clock()
> +		 */
> +		if (clock->flags & CLOCK_SOURCE_USE_FOR_SCHED_CLOCK) {
> +			cycle_t now = cyc2ns(clock, clocksource_read(clock));
> +			_raw_spin_unlock(&clocksource_lock);
> +			return now;
> +		}
> +
> +		_raw_spin_unlock(&clocksource_lock);
> +	}
> +
> +	/* If all else fails, fall back on jiffies */
>  	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
>  					* (NSEC_PER_SEC / HZ);
>  }

--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mundt May 26, 2009, 11:44 p.m. UTC | #2
On Tue, May 26, 2009 at 04:25:03PM -0700, john stultz wrote:
> On Wed, 2009-05-27 at 08:08 +0900, Paul Mundt wrote:
> > On Tue, May 26, 2009 at 10:17:02PM +0200, Thomas Gleixner wrote:
> > > On Tue, 26 May 2009, Peter Zijlstra wrote:
> > > > On Tue, 2009-05-26 at 16:31 +0200, Linus Walleij wrote:
> > > > > The definition of "rating" from the kerneldoc does not
> > > > > seem to imply that, it's a subjective measure AFAICT.
> > > 
> > >   Right, there is no rating threshold defined, which allows to deduce
> > >   that. The TSC on x86 which might be unreliable, but usable as
> > >   sched_clock has an initial rating of 300 which can be changed later
> > >   on to 0 when the TSC is unusable as a time of day source. In that
> > >   case clock is replaced by HPET which has a rating > 100 but is
> > >   definitely not a good choice for sched_clock
> > > 
> > > > > Else you might want an additional criteria, like
> > > > > cyc2ns(1) (much less than) jiffies_to_usecs(1)*1000
> > > > > (however you do that the best way)
> > > > > so you don't pick something
> > > > > that isn't substantially faster than the jiffy counter at least?
> > > 
> > >   What we can do is add another flag to the clocksource e.g.
> > >   CLOCK_SOURCE_USE_FOR_SCHED_CLOCK and check this instead of the
> > >   rating.
> > > 
> > Ok, so based on this and John's locking concerns, how about something
> > like this? It doesn't handle the wrapping cases, but I wonder if we
> > really want to add that amount of logic to sched_clock() in the first
> > place. Clocksources that wrap frequently could either leave the flag
> > unset, or do something similar to the TSC code where the cyc2ns shift is
> > used. If this is something we want to handle generically, then I'll have
> > a go at generalizing the TSC cyc2ns scaling bits for the next spin.
> 
> 
> Yea. So this is a little better. There's still a few other issues to
> consider:
> 
> 1) What if a clocksource is registered that has the _SCHED_CLOCK bit
> set, but is not selected for timekeeping due to it being unstable like the
> TSC?
> 
See, this is what I thought the rating information was useful for, as the
rating is subsequently dropped if it is not usable. But perhaps it makes
more sense to just clear the bit at the same time that the rating is
lowered once it turns out to be unstable.

> 2) Conditionally returning jiffies if the lock is held seems troubling.
> Might get some crazy values that way.
> 
What would you recommend instead? We do not want to spin here, and if we
are in the middle of changing clocksources and returning jiffies anyways,
then this same issue pops up in the current sched_clock() implementation
regardless of whether we are testing for lock contention or not.
Likewise, even if we were to spin, the same situation exists if the new
clocksource does not have the _SCHED_CLOCK bit set and we have to fall
back on jiffies anyways, doesn't it?

Put another way, and unless I'm missing something obvious, if we ignore
my changes to sched_clock(), how is your concern not applicable to the case
where we are changing clocksources and using generic sched_clock() as it
is today?
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner May 26, 2009, 11:49 p.m. UTC | #3
On Wed, 27 May 2009, Paul Mundt wrote:

> On Tue, May 26, 2009 at 10:17:02PM +0200, Thomas Gleixner wrote:
> > On Tue, 26 May 2009, Peter Zijlstra wrote:
> > > On Tue, 2009-05-26 at 16:31 +0200, Linus Walleij wrote:
> > > > The definition of "rating" from the kerneldoc does not
> > > > seem to imply that, it's a subjective measure AFAICT.
> > 
> >   Right, there is no rating threshold defined, which allows to deduce
> >   that. The TSC on x86 which might be unreliable, but usable as
> >   sched_clock has an initial rating of 300 which can be changed later
> >   on to 0 when the TSC is unusable as a time of day source. In that
> >   case clock is replaced by HPET which has a rating > 100 but is
> >   definitely not a good choice for sched_clock
> > 
> > > > Else you might want an additional criteria, like
> > > > cyc2ns(1) (much less than) jiffies_to_usecs(1)*1000
> > > > (however you do that the best way)
> > > > so you don't pick something
> > > > that isn't substantially faster than the jiffy counter at least?
> > 
> >   What we can do is add another flag to the clocksource e.g.
> >   CLOCK_SOURCE_USE_FOR_SCHED_CLOCK and check this instead of the
> >   rating.
> > 
> Ok, so based on this and John's locking concerns, how about something
> like this? It doesn't handle the wrapping cases, but I wonder if we
> really want to add that amount of logic to sched_clock() in the first
> place. Clocksources that wrap frequently could either leave the flag
> unset, or do something similar to the TSC code where the cyc2ns shift is
> used. If this is something we want to handle generically, then I'll have
> a go at generalizing the TSC cyc2ns scaling bits for the next spin.

Gah. There is no locking issue. As Peter explained before the
scheduler code can cope with some inaccurate value.

The wrap issue is completely academic. If the current clock source has
a wrap issue then it needs to be addressed anyway by frequent enough
wakeups to assure correctness of timekeeping and that makes it
suitable for the sched clock domain as well. Also the scheduler can
not hit a value which has not gone through the irq_enter() based
update after a long idle sleep.

So changing your previous patch from

   if (clock && clock->rating > 100)

to

   if (clock && (clock->flags & CLOCK_SOURCE_USE_FOR_SCHED_CLOCK))

is sufficient.

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner May 27, 2009, 12:18 a.m. UTC | #4
On Wed, 27 May 2009, Paul Mundt wrote:
> > Yea. So this is a little better. There's still a few other issues to
> > consider:
> > 
> > 1) What if a clocksource is registered that has the _SCHED_CLOCK bit
> > set, but is not selected for timekeeping due to it being unstable like the
> > TSC?
> > 
> See, this is what I thought the rating information was useful for, as the
> rating is subsequently dropped if it is not usable. But perhaps it makes
> more sense to just clear the bit at the same time that the rating is
> lowered once it turns out to be unstable.

Stop worrying about TSC please. The x86 f*cked up timers need special
handling which is definitely not required for most of arch/*. x86
overrides that anyway and handles the TSC f*ckup in the
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK section of sched_clock.c which is
completely irrelevant to any architecture which has a sane set of
timers.

The only extra magic which is required to avoid that (sub)arch
maintainers need to specify a sched_clock() implementation to override
the weak generic one is really the simple

    if (clock && (clock->flags & CLOCK_SOURCE_USE_FOR_SCHED_CLOCK))
       return ....

We need no locking there at all.

We have a workaround in place, which overrides the weak sched_clock()
implementation, to make x86 efficient, so why do we want to impose all
that x86 crap on folks which deal with architectures which got the
timers right ?

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Stultz May 27, 2009, 12:22 a.m. UTC | #5
On Wed, 2009-05-27 at 08:44 +0900, Paul Mundt wrote:
> On Tue, May 26, 2009 at 04:25:03PM -0700, john stultz wrote:
> > On Wed, 2009-05-27 at 08:08 +0900, Paul Mundt wrote:
> > > On Tue, May 26, 2009 at 10:17:02PM +0200, Thomas Gleixner wrote:
> > > > On Tue, 26 May 2009, Peter Zijlstra wrote:
> > > > > On Tue, 2009-05-26 at 16:31 +0200, Linus Walleij wrote:
> > > > > > The definition of "rating" from the kerneldoc does not
> > > > > > seem to imply that, it's a subjective measure AFAICT.
> > > > 
> > > >   Right, there is no rating threshold defined, which allows to deduce
> > > >   that. The TSC on x86 which might be unreliable, but usable as
> > > >   sched_clock has an initial rating of 300 which can be changed later
> > > >   on to 0 when the TSC is unusable as a time of day source. In that
> > > >   case clock is replaced by HPET which has a rating > 100 but is
> > > >   definitely not a good choice for sched_clock
> > > > 
> > > > > > Else you might want an additional criteria, like
> > > > > > cyc2ns(1) (much less than) jiffies_to_usecs(1)*1000
> > > > > > (however you do that the best way)
> > > > > > so you don't pick something
> > > > > > that isn't substantially faster than the jiffy counter at least?
> > > > 
> > > >   What we can do is add another flag to the clocksource e.g.
> > > >   CLOCK_SOURCE_USE_FOR_SCHED_CLOCK and check this instead of the
> > > >   rating.
> > > > 
> > > Ok, so based on this and John's locking concerns, how about something
> > > like this? It doesn't handle the wrapping cases, but I wonder if we
> > > really want to add that amount of logic to sched_clock() in the first
> > > place. Clocksources that wrap frequently could either leave the flag
> > > unset, or do something similar to the TSC code where the cyc2ns shift is
> > > used. If this is something we want to handle generically, then I'll have
> > > a go at generalizing the TSC cyc2ns scaling bits for the next spin.
> > 
> > 
> > Yea. So this is a little better. There's still a few other issues to
> > consider:
> > 
> > 1) What if a clocksource is registered that has the _SCHED_CLOCK bit
> > set, but is not selected for timekeeping due to it being unstable like the
> > TSC?
> > 
> See, this is what I thought the rating information was useful for, as the
> rating is subsequently dropped if it is not usable. But perhaps it makes
> more sense to just clear the bit at the same time that the rating is
> lowered once it turns out to be unstable.

Yes, if we're dropping a clocksource we should also drop the bit. That
shouldn't be a problem.

The point I was making, is that multiple clocksources may be registered
at one time (TSC, ACPI_PM, etc). But only one is being managed by the
timekeeping code (clock). So there may be the case where the
sched_clock() is different than the timekeeping clock (which is common
on x86). 

So I suspect we need a special hook that grabs the best _SCHED_CLOCK
clocksource (as computed at clocksource registration time) and provides
it to the generic sched_clock() interface.


> > 2) Conditionally returning jiffies if the lock is held seems troubling.
> > Might get some crazy values that way.
> > 
> What would you recommend instead? We do not want to spin here, and if we
> are in the middle of changing clocksources and returning jiffies anyways,
> then this same issue pops up in the current sched_clock() implementation
> regardless of whether we are testing for lock contention or not.
> Likewise, even if we were to spin, the same situation exists if the new
> clocksource does not have the _SCHED_CLOCK bit set and we have to fall
> back on jiffies anyways, doesn't it?
> 
> Put another way, and unless I'm missing something obvious, if we ignore
> my changes to sched_clock(), how is your concern not applicable to the case
> where we are changing clocksources and using generic sched_clock() as it
> is today?

Well, Thomas' point that locking isn't necessary, as sched_clock()
doesn't have to be correct, is probably right. 

So, I think a get_sched_clocksource() interface would be ideal (if we
want to get academic at a later date, the pointer could be atomically
updated, and we'd keep it valid for some time via an rcu like method).

Additionally, you can set the jiffies clocksource as a _SCHED_CLOCK
clocksource and drop the jiffies fallback code completely.

thanks
-john


--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mundt May 27, 2009, 12:26 a.m. UTC | #6
On Tue, May 26, 2009 at 05:22:10PM -0700, john stultz wrote:
> On Wed, 2009-05-27 at 08:44 +0900, Paul Mundt wrote:
> > What would you recommend instead? We do not want to spin here, and if we
> > are in the middle of changing clocksources and returning jiffies anyways,
> > then this same issue pops up in the current sched_clock() implementation
> > regardless of whether we are testing for lock contention or not.
> > Likewise, even if we were to spin, the same situation exists if the new
> > clocksource does not have the _SCHED_CLOCK bit set and we have to fall
> > back on jiffies anyways, doesn't it?
> > 
> > Put another way, and unless I'm missing something obvious, if we ignore
> > my changes to sched_clock(), how is your concern not applicable to the case
> > where we are changing clocksources and using generic sched_clock() as it
> > is today?
> 
> Well, Thomas' point that locking isn't necessary, as sched_clock()
> doesn't have to be correct, is probably right. 
> 
> So, I think a get_sched_clocksource() interface would be ideal (if we
> want to get academic at a later date, the pointer could be atomically
> updated, and we'd keep it valid for some time via an rcu like method).
> 
> Additionally, you can set the jiffies clocksource as a _SCHED_CLOCK
> clocksource and drop the jiffies fallback code completely.
> 
I thought about that initially as well, but in the case of the jiffies
clocksource, that won't handle INITIAL_JIFFIES, which we want to subtract
to make printk times sane.
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner May 27, 2009, 12:27 a.m. UTC | #7
John,

On Tue, 26 May 2009, john stultz wrote:
> > See, this is what I thought the rating information was useful for, as the
> > rating is subsequently dropped if it is not usable. But perhaps it makes
> > more sense to just clear the bit at the same time that the rating is
> > lowered once it turns out to be unstable.
> 
> Yes, if we're dropping a clocksource we should also drop the bit. That
> shouldn't be a problem.
> 
> The point I was making, is that multiple clocksources may be registered
> at one time (TSC, ACPI_PM, etc). But only one is being managed by the
> timekeeping code (clock). So there may be the case where the
> sched_clock() is different than the timekeeping clock (which is common
> on x86). 
> 
> So I suspect we need a special hook that grabs the best _SCHED_CLOCK
> clocksource (as computed at clocksource registration time) and provides
> it to the generic sched_clock() interface.

this is not about x86 and its inferior timer hardware
implementation. We talk about sane architectures which do not have
those problems at all. x86 takes a different code path and overrides
the generic weak sched_clock implementation anyway. So what ?

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Stultz May 27, 2009, 1:09 a.m. UTC | #8
On Wed, 2009-05-27 at 09:26 +0900, Paul Mundt wrote:
> On Tue, May 26, 2009 at 05:22:10PM -0700, john stultz wrote:
> > On Wed, 2009-05-27 at 08:44 +0900, Paul Mundt wrote:
> > > What would you recommend instead? We do not want to spin here, and if we
> > > are in the middle of changing clocksources and returning jiffies anyways,
> > > then this same issue pops up in the current sched_clock() implementation
> > > regardless of whether we are testing for lock contention or not.
> > > Likewise, even if we were to spin, the same situation exists if the new
> > > clocksource does not have the _SCHED_CLOCK bit set and we have to fall
> > > back on jiffies anyways, doesn't it?
> > > 
> > > Put another way, and unless I'm missing something obvious, if we ignore
> > > my changes to sched_clock(), how is your concern not applicable to the case
> > > where we are changing clocksources and using generic sched_clock() as it
> > > is today?
> > 
> > Well, Thomas' point that locking isn't necessary, as sched_clock()
> > doesn't have to be correct, is probably right. 
> > 
> > So, I think a get_sched_clocksource() interface would be ideal (if we
> > want to get academic at a later date, the pointer could be atomically
> > updated, and we'd keep it valid for some time via an rcu like method).
> > 
> > Additionally, you can set the jiffies clocksource as a _SCHED_CLOCK
> > clocksource and drop the jiffies fallback code completely.
> > 
> I thought about that initially as well, but in the case of the jiffies
> clocksource, that won't handle INITIAL_JIFFIES, which we want to subtract
> to make printk times sane.

Fair point, but that shouldn't be a big issue, we can fix that in the
jiffies clocksource read() implementation.

thanks
-john


--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index c56457c..cfd873e 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -203,6 +203,7 @@  struct clocksource {
 };
 
 extern struct clocksource *clock;	/* current clocksource */
+extern spinlock_t clocksource_lock;
 
 /*
  * Clock source flags bits::
@@ -212,6 +213,7 @@  extern struct clocksource *clock;	/* current clocksource */
 
 #define CLOCK_SOURCE_WATCHDOG			0x10
 #define CLOCK_SOURCE_VALID_FOR_HRES		0x20
+#define CLOCK_SOURCE_USE_FOR_SCHED_CLOCK	0x40
 
 /* simplify initialization of mask field */
 #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9..c7027cd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -30,6 +30,7 @@ 
 #include <linux/percpu.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
+#include <linux/clocksource.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -38,6 +39,27 @@ 
  */
 unsigned long long __attribute__((weak)) sched_clock(void)
 {
+	/*
+	 * Use the current clocksource when it becomes available later in
+	 * the boot process. As this needs to be fast, we only make a
+	 * single pass at grabbing the spinlock. If the clock is changing
+	 * out from underneath us, fall back on jiffies and try it again
+	 * the next time around.
+	 */
+	if (clock && _raw_spin_trylock(&clocksource_lock)) {
+		/*
+		 * Only use clocksources suitable for sched_clock()
+		 */
+		if (clock->flags & CLOCK_SOURCE_USE_FOR_SCHED_CLOCK) {
+			cycle_t now = cyc2ns(clock, clocksource_read(clock));
+			_raw_spin_unlock(&clocksource_lock);
+			return now;
+		}
+
+		_raw_spin_unlock(&clocksource_lock);
+	}
+
+	/* If all else fails, fall back on jiffies */
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
 }