
[v5,02/23] kernel: Define gettimeofday vdso common code

Message ID 20190222122430.21180-3-vincenzo.frascino@arm.com (mailing list archive)
State New, archived
Series Unify vDSOs across more architectures

Commit Message

Vincenzo Frascino Feb. 22, 2019, 12:24 p.m. UTC
In the last few years we have witnessed an explosion of vDSO
implementations that mostly share similar code.

Try to unify the gettimeofday vDSO implementation by introducing
lib/vdso. The code contained in this library can ideally be reused
by all architectures, avoiding code duplication where possible.

Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
---
 include/vdso/datapage.h |   1 +
 include/vdso/helpers.h  |  52 ++++++++++++
 include/vdso/types.h    |  39 +++++++++
 lib/Kconfig             |   5 ++
 lib/vdso/Kconfig        |  37 +++++++++
 lib/vdso/Makefile       |  22 +++++
 lib/vdso/gettimeofday.c | 175 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 331 insertions(+)
 create mode 100644 include/vdso/helpers.h
 create mode 100644 include/vdso/types.h
 create mode 100644 lib/vdso/Kconfig
 create mode 100644 lib/vdso/Makefile
 create mode 100644 lib/vdso/gettimeofday.c

Comments

Mark Rutland Feb. 22, 2019, 1:34 p.m. UTC | #1
On Fri, Feb 22, 2019 at 12:24:09PM +0000, Vincenzo Frascino wrote:
> In the last few years we have witnessed an explosion of vDSO
> implementations that mostly share similar code.
> 
> Try to unify the gettimeofday vDSO implementation by introducing
> lib/vdso. The code contained in this library can ideally be reused
> by all architectures, avoiding code duplication where possible.
> 
> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
> ---
>  include/vdso/datapage.h |   1 +
>  include/vdso/helpers.h  |  52 ++++++++++++
>  include/vdso/types.h    |  39 +++++++++
>  lib/Kconfig             |   5 ++
>  lib/vdso/Kconfig        |  37 +++++++++
>  lib/vdso/Makefile       |  22 +++++
>  lib/vdso/gettimeofday.c | 175 ++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 331 insertions(+)
>  create mode 100644 include/vdso/helpers.h
>  create mode 100644 include/vdso/types.h
>  create mode 100644 lib/vdso/Kconfig
>  create mode 100644 lib/vdso/Makefile
>  create mode 100644 lib/vdso/gettimeofday.c
> 
> diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
> index da346ad02b03..ff332fcba73c 100644
> --- a/include/vdso/datapage.h
> +++ b/include/vdso/datapage.h
> @@ -9,6 +9,7 @@
>  #include <linux/bits.h>
>  #include <linux/types.h>
>  #include <linux/time.h>
> +#include <vdso/types.h>
>  
>  #define VDSO_BASES	(CLOCK_TAI + 1)
>  #define VDSO_HRES	(BIT(CLOCK_REALTIME)		| \
> diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h
> new file mode 100644
> index 000000000000..511dea979f6b
> --- /dev/null
> +++ b/include/vdso/helpers.h
> @@ -0,0 +1,52 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __VDSO_HELPERS_H
> +#define __VDSO_HELPERS_H
> +
> +#ifdef __KERNEL__

Nit: __KERNEL__ guards can go.

> +
> +#ifndef __ASSEMBLY__
> +
> +#include <vdso/datapage.h>
> +
> +static __always_inline notrace u32 vdso_read_begin(const struct vdso_data *vd)

Rather than explicitly annotating all functions with notrace, can't we
disable instrumentation for all C files used for the vDSO using compiler
flags?

That would be more robust, and make the code less noisy to read.

> +{
> +	u32 seq;
> +
> +repeat:
> +	seq = READ_ONCE(vd->seq);
> +	if (seq & 1) {
> +		cpu_relax();
> +		goto repeat;
> +	}
> +
> +	smp_rmb();
> +	return seq;
> +}

You could simplify the repeat loop as:

	while ((seq = READ_ONCE(vd->seq)) & 1)
		cpu_relax();
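
For reference, the whole helper with that simplification applied would be
(a sketch only, keeping the semantics of the patch above):

static __always_inline notrace u32 vdso_read_begin(const struct vdso_data *vd)
{
	u32 seq;

	/* Spin while an update is in progress (sequence count is odd). */
	while ((seq = READ_ONCE(vd->seq)) & 1)
		cpu_relax();

	smp_rmb();	/* Order the seq read before the subsequent data reads. */
	return seq;
}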

> +
> +static __always_inline notrace u32 vdso_read_retry(const struct vdso_data *vd,
> +						   u32 start)
> +{
> +	u32 seq;
> +
> +	smp_rmb();
> +	seq = READ_ONCE(vd->seq);
> +	return seq != start;
> +}
> +
> +static __always_inline notrace void vdso_write_begin(struct vdso_data *vd)
> +{
> +	++vd->seq;
> +	smp_wmb();
> +}
> +
> +static __always_inline notrace void vdso_write_end(struct vdso_data *vd)
> +{
> +	smp_wmb();
> +	++vd->seq;
> +}

I realise this is what existing vDSO update code does, but I do think
that these should be using WRITE_ONCE() to perform the update, to ensure
that the write is not torn.

e.g. these should be:

static __always_inline notrace void vdso_write_begin(struct vdso_data *vd)
{
	WRITE_ONCE(vd->seq, vd->seq + 1);
	smp_wmb();
}

static __always_inline notrace void vdso_write_end(struct vdso_data *vd)
{
	smp_wmb();
	WRITE_ONCE(vd->seq, vd->seq + 1);
}

Otherwise, the compiler can validly tear updates to vd->seq, and there's
the possibility that a reader sees an inconsistent value for vd->seq,
and consequently uses inconsistent data read from the vdso data page.

[...]

> +#include <linux/types.h>
> +#include <linux/time.h>

Nit: please order headers alphabetically.

> +
> +/*
> + * The definitions below are required to overcome the limitations
> + * of time_t on 32 bit architectures, which overflows in 2038.
> + * The new code should use the replacements based on time64_t and
> + * timespec64.
> + *
> + * The abstraction below will be updated once the migration to
> + * time64_t is complete.
> + */
> +#ifdef CONFIG_GENERIC_VDSO_32
> +#define __vdso_timespec		old_timespec32
> +#define __vdso_timeval		old_timeval32
> +#else
> +#ifdef ENABLE_COMPAT_VDSO
> +#define __vdso_timespec		old_timespec32
> +#define __vdso_timeval		old_timeval32
> +#else
> +#define __vdso_timespec		__kernel_timespec
> +#define __vdso_timeval		__kernel_old_timeval
> +#endif /* CONFIG_COMPAT_VDSO */
> +#endif /* CONFIG_GENERIC_VDSO_32 */

I'm not sure what the comment is trying to say.

For a 64-bit kernel, does this affect the native vDSO, or just the compat
vDSO?

[...]
> +config HAVE_GENERIC_VDSO
> +	bool
> +	default n

IIRC, 'n' is the implicit default.
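
i.e. the entry can be reduced to:

config HAVE_GENERIC_VDSO
	bool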

Thanks,
Mark.
Arnd Bergmann Feb. 22, 2019, 1:49 p.m. UTC | #2
On Fri, Feb 22, 2019 at 1:25 PM Vincenzo Frascino
<vincenzo.frascino@arm.com> wrote:

> +/*
> + * The definitions below are required to overcome the limitations
> + * of time_t on 32 bit architectures, which overflows in 2038.
> + * The new code should use the replacements based on time64_t and
> + * timespec64.
> + *
> + * The abstraction below will be updated once the migration to
> + * time64_t is complete.
> + */
> +#ifdef CONFIG_GENERIC_VDSO_32
> +#define __vdso_timespec                old_timespec32
> +#define __vdso_timeval         old_timeval32
> +#else
> +#ifdef ENABLE_COMPAT_VDSO
> +#define __vdso_timespec                old_timespec32
> +#define __vdso_timeval         old_timeval32
> +#else
> +#define __vdso_timespec                __kernel_timespec
> +#define __vdso_timeval         __kernel_old_timeval
> +#endif /* CONFIG_COMPAT_VDSO */
> +#endif /* CONFIG_GENERIC_VDSO_32 */

I don't think we need __vdso_timeval at all, just use
__kernel_old_timeval everywhere. There is no generic
64-bit timeval type in the kernel, and there won't be because
gettimeofday() is deprecated.
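
That is, the gettimeofday() prototype would simply become (a sketch):

static notrace int __cvdso_gettimeofday(struct __kernel_old_timeval *tv,
					struct timezone *tz);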

For __vdso_timespec, I see how you ended up with this
redefinition, and it makes the current version of your patches
easier, but I fear it will in turn make it harder to add the
__kernel_old_timeval based variant.

What I'd prefer to see here is to always use the fixed length
types everywhere in the code, such as

static notrace int __cvdso_clock_gettime64(clockid_t clock,
                                        struct __kernel_timespec *ts)
{
... /* normal clock_gettime using 64-bit times */
}

static notrace int __cvdso_clock_gettime32(clockid_t clock,
                                        struct old_timespec32 *ts32)
{
     struct __kernel_timespec ts;
     int ret = __cvdso_clock_gettime64(clock, &ts);

     ts32->tv_sec = ts.tv_sec;
     ts32->tv_nsec = ts.tv_nsec;

     return ret;
}

and then have two different versions for 32-bit and 64-bit
architectures, calling one or the other function of the
generic code.
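
For illustration, a native 32-bit architecture would then wire its entry
point to the 32-bit wrapper, while a 64-bit architecture would call
__cvdso_clock_gettime64() directly (a sketch; the __vdso_* symbol name
follows the usual convention and is not something this patch defines):

/* Native 32-bit architecture: expose the legacy 32-bit layout. */
notrace int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
{
	return __cvdso_clock_gettime32(clock, ts);
}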

        Arnd
Arnd Bergmann Feb. 22, 2019, 2:36 p.m. UTC | #3
On Fri, Feb 22, 2019 at 2:49 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Feb 22, 2019 at 1:25 PM Vincenzo Frascino <vincenzo.frascino@arm.com> wrote:
> For __vdso_timespec, I see how you ended up with this
> redefinition, and it makes the current version of your patches
> easier, but I fear it will in turn make it harder to add the
> __kernel_old_timeval based variant.

I meant __kernel_timespec here, not __kernel_old_timeval.

      Arnd
Thomas Gleixner Feb. 23, 2019, 10:34 a.m. UTC | #4
On Fri, 22 Feb 2019, Vincenzo Frascino wrote:
> +static notrace int __cvdso_clock_getres(clockid_t clock,
> +					struct __vdso_timespec *res)
> +{
> +	u64 sec, ns;
> +	u32 msk;
> +
> +	/* Check for negative values or invalid clocks */
> +	if (unlikely((u32) clock >= MAX_CLOCKS))
> +		goto fallback;
> +
> +	/*
> +	 * Convert the clockid to a bitmask and use it to check which
> +	 * clocks are handled in the VDSO directly.
> +	 */
> +	msk = 1U << clock;
> +	if (msk & VDSO_HRES) {
> +		/*
> +		 * Preserves the behaviour of posix_get_hrtimer_res().
> +		 */

So much for the theory.

> +		sec = 0;
> +		ns = MONOTONIC_RES_NSEC;

posix_get_hrtimer_res() does:

	sec = 0;
	ns = hrtimer_resolution;

and hrtimer_resolution depends on the enablement of high resolution timers
either compile or run time.

So you need to have a copy of hrtimer_resolution in the vdso data and use
that.

> +	} else if (msk & VDSO_COARSE) {
> +		/*
> +		 * Preserves the behaviour of posix_get_coarse_res().
> +		 */
> +		ns = LOW_RES_NSEC;
> +		sec = __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);

Do we allow CONFIG_HZ = 1?

Thanks,

	tglx
Thomas Gleixner Feb. 23, 2019, 5:31 p.m. UTC | #5
On Fri, 22 Feb 2019, Vincenzo Frascino wrote:
> +static notrace int do_hres(const struct vdso_data *vd,
> +			   clockid_t clk,
> +			   struct __vdso_timespec *ts)
> +{
> +	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
> +	u64 cycles, last, sec, ns;
> +	u32 seq, cs_index = CLOCKSOURCE_MONO;
> +
> +	if (clk == CLOCK_MONOTONIC_RAW)
> +		cs_index = CLOCKSOURCE_RAW;

Uuurgh. So you create an array with 16 members and then use two. This code
is really optimized and now you add not only the pointless array, you also
need the extra index plus another conditional. Not to talk about the cache
impact which makes things even worse. In the x86 implementation we have:

       u32 		seq;			 +  0
       int		mode;			 +  4
       u64		mask;			 +  8
       u32		mult;			 + 16
       u32		shift;			 + 20
       struct vgtod_ts	basetimer[VGTOD_BASES];  + 24

Each basetime array member occupies 16 bytes. So

	CLOCK_REALTIME		+ 24
	CLOCK_MONOTONIC		+ 40
	..
		cacheline boundary		
	..
	CLOCK_REALTIME_COARSE	+ 104
	CLOCK_MONOTONIC_COARSE	+ 120   <- cacheline boundary
	CLOCK_BOOTTIME		+ 136
	CLOCK_REALTIME_ALARM	+ 152
	CLOCK_BOOTTIME_ALARM	+ 168
       
So the most used clocks REALTIME/MONO are in the first cacheline.

So with your scheme the thing becomes

       u32 		seq;			 +   0
       int		mode;			 +   4
       struct cs	cs[16]			 +   8
       struct vgtod_ts	basetimer[VGTOD_BASES];  + 264

and 

	CLOCK_REALTIME		+ 264
	CLOCK_MONOTONIC		+ 280

IOW, the most important clocks touch TWO cachelines now which are not even
adjacent. No, they are 256 bytes apart, which really sucks for prefetching.

We're surely not going to sacrifice the performance which we carefully tuned
in that code just to support MONO_RAW. The solution I showed you in the
other reply does not have these problems at all.

It's easy enough to benchmark these implementations and without trying I'm
pretty sure that you can see the performance drop nicely. Please do so next
time and provide the numbers in the changelogs.

Thanks,

	tglx
Vincenzo Frascino Feb. 25, 2019, 2:09 p.m. UTC | #6
Hi Thomas,

thank you for your review.

On 23/02/2019 10:34, Thomas Gleixner wrote:
> On Fri, 22 Feb 2019, Vincenzo Frascino wrote:
>> +static notrace int __cvdso_clock_getres(clockid_t clock,
>> +					struct __vdso_timespec *res)
>> +{
>> +	u64 sec, ns;
>> +	u32 msk;
>> +
>> +	/* Check for negative values or invalid clocks */
>> +	if (unlikely((u32) clock >= MAX_CLOCKS))
>> +		goto fallback;
>> +
>> +	/*
>> +	 * Convert the clockid to a bitmask and use it to check which
>> +	 * clocks are handled in the VDSO directly.
>> +	 */
>> +	msk = 1U << clock;
>> +	if (msk & VDSO_HRES) {
>> +		/*
>> +		 * Preserves the behaviour of posix_get_hrtimer_res().
>> +		 */
> 
> So much for the theory.
> 
>> +		sec = 0;
>> +		ns = MONOTONIC_RES_NSEC;
> 
> posix_get_hrtimer_res() does:
> 
> 	sec = 0;
> 	ns = hrtimer_resolution;
> 
> and hrtimer_resolution depends on the enablement of high resolution timers
> either compile or run time.
> 
> So you need to have a copy of hrtimer_resolution in the vdso data and use
> that.
> 

I agree: MONOTONIC_RES_NSEC can be HIGH_RES_NSEC or LOW_RES_NSEC depending on
the HIGH_RES_TIMERS configuration option, but that does not cover the run-time
switch. I will add a copy of hrtimer_resolution to the vdso data in the next
iteration of the patches.
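
Something along these lines, as a sketch only (the hrtimer_res field name
and its exact placement in struct vdso_data are assumptions for v6):

/*
 * Hypothetical addition to struct vdso_data, updated by the timekeeping
 * code together with the other fields:
 *
 *	u32	hrtimer_res;	- snapshot of hrtimer_resolution
 */
static notrace int vdso_hres_getres(const struct vdso_data *vd,
				    struct __vdso_timespec *res)
{
	/* Mirrors posix_get_hrtimer_res(): the resolution is below 1s. */
	res->tv_sec = 0;
	res->tv_nsec = READ_ONCE(vd->hrtimer_res);
	return 0;
}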

I had a look at the other implementations as well, and it seems that all the
architectures that currently implement getres() make the same wrong assumption I
made. I am going to provide a separate patch set that targets this.

>> +	} else if (msk & VDSO_COARSE) {
>> +		/*
>> +		 * Preserves the behaviour of posix_get_coarse_res().
>> +		 */
>> +		ns = LOW_RES_NSEC;
>> +		sec = __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
> 
> Do we allow CONFIG_HZ = 1?
> 

I had a closer look at it today and it seems that jiffies.h supports
CONFIG_HZ=12 as a minimum. Hence I think it should be safe to remove
__iter_div_u64_rem() and set sec = 0 in this case.

> Thanks,
> 
> 	tglx
>
Vincenzo Frascino Feb. 27, 2019, 1:47 p.m. UTC | #7
Hi Thomas,

On 23/02/2019 17:31, Thomas Gleixner wrote:
> On Fri, 22 Feb 2019, Vincenzo Frascino wrote:
>> +static notrace int do_hres(const struct vdso_data *vd,
>> +			   clockid_t clk,
>> +			   struct __vdso_timespec *ts)
>> +{
>> +	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
>> +	u64 cycles, last, sec, ns;
>> +	u32 seq, cs_index = CLOCKSOURCE_MONO;
>> +
>> +	if (clk == CLOCK_MONOTONIC_RAW)
>> +		cs_index = CLOCKSOURCE_RAW;
> 
> Uuurgh. So you create an array with 16 members and then use two. This code
> is really optimized and now you add not only the pointless array, you also
> need the extra index plus another conditional. Not to talk about the cache
> impact which makes things even worse. In the x86 implementation we have:
> 
>        u32 		seq;			 +  0
>        int		mode;			 +  4
>        u64		mask;			 +  8
>        u32		mult;			 + 16
>        u32		shift;			 + 20
>        struct vgtod_ts	basetimer[VGTOD_BASES];  + 24
> 
> Each basetime array member occupies 16 bytes. So
> 
> 	CLOCK_REALTIME		+ 24
> 	CLOCK_MONOTONIC		+ 40
> 	..
> 		cacheline boundary		
> 	..
> 	CLOCK_REALTIME_COARSE	+ 104
> 	CLOCK_MONOTONIC_COARSE	+ 120   <- cacheline boundary
> 	CLOCK_BOOTTIME		+ 136
> 	CLOCK_REALTIME_ALARM	+ 152
> 	CLOCK_BOOTTIME_ALARM	+ 168
>        
> So the most used clocks REALTIME/MONO are in the first cacheline.
> 
> So with your scheme the thing becomes
> 
>        u32 		seq;			 +   0
>        int		mode;			 +   4
>        struct cs	cs[16]			 +   8
>        struct vgtod_ts	basetimer[VGTOD_BASES];  + 264
> 
> and 
> 
> 	CLOCK_REALTIME		+ 264
> 	CLOCK_MONOTONIC		+ 280
>

The clocksource array has two elements (CLOCKSOURCE_RAW, CLOCKSOURCE_MONO) and
the situation with my scheme should be the following:
	u32		seq;			+    0
	s32		clock_mode;		+    4
	u64		cycle_last;		+    8
	struct vdso_cs	cs[2];			+    16
	struct vdso_ts	basetime[VDSO_BASES];	+    48

which I agree still makes things a bit worse.
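
In struct form (a sketch matching the offsets above; struct names are
abbreviations used in this email, not the patch):

struct vdso_ts {	/* one entry per clock id, 16 bytes */
	u64	sec;
	u64	nsec;
};

struct vdso_cs {	/* one entry per clocksource, 16 bytes */
	u64	mask;
	u32	mult;
	u32	shift;
};

struct vdso_data {
	u32		seq;				/* +  0 */
	s32		clock_mode;			/* +  4 */
	u64		cycle_last;			/* +  8 */
	struct vdso_cs	cs[2];				/* + 16 */
	struct vdso_ts	basetime[VDSO_BASES];		/* + 48 */
};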

Assuming L1_CACHE_SHIFT == 6:

	CLOCK_REALTIME			+    48
	...
	cache boundary
	...
	CLOCK_MONOTONIC			+    64
	CLOCK_PROCESS_CPUTIME_ID	+    80
	CLOCK_THREAD_CPUTIME_ID		+    96
	CLOCK_MONOTONIC_RAW		+    112
	...
	cache boundary
	...
	CLOCK_REALTIME_COARSE		+    128
	CLOCK_MONOTONIC_COARSE		+    144
	CLOCK_BOOTTIME			+    160
	CLOCK_REALTIME_ALARM		+    176
	CLOCK_BOOTTIME_ALARM		+    192
	...

> IOW, the most important clocks touch TWO cachelines now which are not even
> adjacent. No, they are 256 bytes apart, which really sucks for prefetching.
> 
> We're surely not going to sacrifice the performance which we carefully tuned
> in that code just to support MONO_RAW. The solution I showed you in the
> other reply does not have these problems at all.
> 
> It's easy enough to benchmark these implementations and without trying I'm
> pretty sure that you can see the performance drop nicely. Please do so next
> time and provide the numbers in the changelogs.
> 

I ran some benchmarks this morning to quantify the performance impact; using
vdsotest [1], the difference between a stock Linux kernel 5.0.0-rc7 and one
with the unified vDSO, running on my x86 machine (Xeon Gold 5120T), is below
1%. Please find the results below; I will add them to the next changelog as
well.

[1] https://github.com/nathanlynch/vdsotest

> Thanks,
> 
> 	tglx
>
Vincenzo Frascino Feb. 27, 2019, 2:52 p.m. UTC | #8
Hi Arnd,

thank you for your review.

On 22/02/2019 13:49, Arnd Bergmann wrote:
> On Fri, Feb 22, 2019 at 1:25 PM Vincenzo Frascino
> <vincenzo.frascino@arm.com> wrote:
> 
>> +/*
>> + * The definitions below are required to overcome the limitations
>> + * of time_t on 32 bit architectures, which overflows in 2038.
>> + * The new code should use the replacements based on time64_t and
>> + * timespec64.
>> + *
>> + * The abstraction below will be updated once the migration to
>> + * time64_t is complete.
>> + */
>> +#ifdef CONFIG_GENERIC_VDSO_32
>> +#define __vdso_timespec                old_timespec32
>> +#define __vdso_timeval         old_timeval32
>> +#else
>> +#ifdef ENABLE_COMPAT_VDSO
>> +#define __vdso_timespec                old_timespec32
>> +#define __vdso_timeval         old_timeval32
>> +#else
>> +#define __vdso_timespec                __kernel_timespec
>> +#define __vdso_timeval         __kernel_old_timeval
>> +#endif /* CONFIG_COMPAT_VDSO */
>> +#endif /* CONFIG_GENERIC_VDSO_32 */
> 
> I don't think we need __vdso_timeval at all, just use
> __kernel_old_timeval everywhere. There is no generic
> 64-bit timeval type in the kernel, and there won't be because
> gettimeofday() is deprecated.
> 

Ok, I will update my implementation accordingly in v6.

> For __vdso_timespec, I see how you ended up with this
> redefinition, and it makes the current version of your patches
> easier, but I fear it will in turn make it harder to add the
> __kernel_old_timeval based variant.
> 

What is __kernel_old_timespec (based on your next email)? Why do you think it
will make it harder to add the new variants?
Thomas Gleixner Feb. 27, 2019, 3:49 p.m. UTC | #9
Vincenzo,

On Wed, 27 Feb 2019, Vincenzo Frascino wrote:
> 
> The clocksource array has two elements (CLOCKSOURCE_RAW, CLOCKSOURCE_MONO) and
> the situation with my scheme should be the following:

Oops. I misread the patch, but still...

> 	u32		seq;			+    0
> 	s32		clock_mode;		+    4
> 	u64		cycle_last;		+    8
> 	struct vdso_cs	cs[2];			+    16
> 	struct vdso_ts	basetime[VDSO_BASES];	+    48
> 
> which I agree still makes things a bit worse.

> > It's easy enough to benchmark these implementations and without trying I'm
> > pretty sure that you can see the performance drop nicely. Please do so next
> > time and provide the numbers in the changelogs.
> > 
> 
> I ran some benchmarks this morning to quantify the performance impact; using
> vdsotest [1], the difference between a stock Linux kernel 5.0.0-rc7 and one
> with the unified vDSO, running on my x86 machine (Xeon Gold 5120T), is below
> 1%. Please find the results below; I will add them to the next changelog as
> well.

I have some doubts about 1%.
                                    NEW   STOCK
clock-gettime-monotonic:    vdso:    31      28    ~10% slower
clock-gettime-realtime:     vdso:    32      29    ~10% slower

Thanks,

	tglx
Vincenzo Frascino Feb. 27, 2019, 4:06 p.m. UTC | #10
On 27/02/2019 15:49, Thomas Gleixner wrote:
> Vincenzo,
> 
> On Wed, 27 Feb 2019, Vincenzo Frascino wrote:
>>
>> The clocksource array has two elements (CLOCKSOURCE_RAW, CLOCKSOURCE_MONO) and
>> the situation with my scheme should be the following:
> 
> Oops. I misread the patch, but still...
> 
>> 	u32		seq;			+    0
>> 	s32		clock_mode;		+    4
>> 	u64		cycle_last;		+    8
>> 	struct vdso_cs	cs[2];			+    16
>> 	struct vdso_ts	basetime[VDSO_BASES];	+    48
>>
>> which I agree still makes things a bit worse.
> 
>>> It's easy enough to benchmark these implementations and without trying I'm
>>> pretty sure that you can see the performance drop nicely. Please do so next
>>> time and provide the numbers in the changelogs.
>>>
>>
>> I ran some benchmarks this morning to quantify the performance impact; using
>> vdsotest [1], the difference between a stock Linux kernel 5.0.0-rc7 and one
>> with the unified vDSO, running on my x86 machine (Xeon Gold 5120T), is below
>> 1%. Please find the results below; I will add them to the next changelog as
>> well.
> 
> I have some doubts about 1%.
>                                     NEW   STOCK
> clock-gettime-monotonic:    vdso:    31      28    ~10% slower
> clock-gettime-realtime:     vdso:    32      29    ~10% slower
>

Sorry, there was an error in the script I used to extract the percentages. I
agree! Luckily I shared the numbers. Thanks Thomas.

> Thanks,
> 
> 	tglx
>
Arnd Bergmann Feb. 28, 2019, 9:29 a.m. UTC | #11
On Wed, Feb 27, 2019 at 3:52 PM Vincenzo Frascino
<vincenzo.frascino@arm.com> wrote:
> On 22/02/2019 13:49, Arnd Bergmann wrote:
> > On Fri, Feb 22, 2019 at 1:25 PM Vincenzo Frascino
> > <vincenzo.frascino@arm.com> wrote:
> >
> >> +/*
> >> + * The definitions below are required to overcome the limitations
> >> + * of time_t on 32 bit architectures, which overflows in 2038.
> >> + * The new code should use the replacements based on time64_t and
> >> + * timespec64.
> >> + *
> >> + * The abstraction below will be updated once the migration to
> >> + * time64_t is complete.
> >> + */
> >> +#ifdef CONFIG_GENERIC_VDSO_32
> >> +#define __vdso_timespec                old_timespec32
> >> +#define __vdso_timeval         old_timeval32
> >> +#else
> >> +#ifdef ENABLE_COMPAT_VDSO
> >> +#define __vdso_timespec                old_timespec32
> >> +#define __vdso_timeval         old_timeval32
> >> +#else
> >> +#define __vdso_timespec                __kernel_timespec
> >> +#define __vdso_timeval         __kernel_old_timeval
> >> +#endif /* CONFIG_COMPAT_VDSO */
> >> +#endif /* CONFIG_GENERIC_VDSO_32 */

> > For __vdso_timespec, I see how you ended up with this
> > redefinition, and it makes the current version of your patches
> > easier, but I fear it will in turn make it harder to add the
> > __kernel_old_timeval based variant.
> >
>
> What is __kernel_old_timespec (based on your next email)? Why do you think it
> will make it harder to add the new variants?

I mean you should ideally use the types that you have listed above
directly, without the abstraction.

In the long run I want one set of functions using __kernel_timespec that
implements the 64-bit time interfaces on both 32-bit and 64-bit kernels,
including the compat vdso, and another set using old_timespec32 for
just the native 32-bit and compat version.

This would match what we do in the normal system calls (in linux-5.1+),
where we always have the regular implementation use 64-bit types
only, and have an optional _time32 version that is used for existing
32-bit user space, on both native 32 bit kernels and compat syscalls
on 64-bit.

       Arnd

Patch

diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index da346ad02b03..ff332fcba73c 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -9,6 +9,7 @@ 
 #include <linux/bits.h>
 #include <linux/types.h>
 #include <linux/time.h>
+#include <vdso/types.h>
 
 #define VDSO_BASES	(CLOCK_TAI + 1)
 #define VDSO_HRES	(BIT(CLOCK_REALTIME)		| \
diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h
new file mode 100644
index 000000000000..511dea979f6b
--- /dev/null
+++ b/include/vdso/helpers.h
@@ -0,0 +1,52 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_HELPERS_H
+#define __VDSO_HELPERS_H
+
+#ifdef __KERNEL__
+
+#ifndef __ASSEMBLY__
+
+#include <vdso/datapage.h>
+
+static __always_inline notrace u32 vdso_read_begin(const struct vdso_data *vd)
+{
+	u32 seq;
+
+repeat:
+	seq = READ_ONCE(vd->seq);
+	if (seq & 1) {
+		cpu_relax();
+		goto repeat;
+	}
+
+	smp_rmb();
+	return seq;
+}
+
+static __always_inline notrace u32 vdso_read_retry(const struct vdso_data *vd,
+						   u32 start)
+{
+	u32 seq;
+
+	smp_rmb();
+	seq = READ_ONCE(vd->seq);
+	return seq != start;
+}
+
+static __always_inline notrace void vdso_write_begin(struct vdso_data *vd)
+{
+	++vd->seq;
+	smp_wmb();
+}
+
+static __always_inline notrace void vdso_write_end(struct vdso_data *vd)
+{
+	smp_wmb();
+	++vd->seq;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
+
+#endif /* __VDSO_HELPERS_H */
diff --git a/include/vdso/types.h b/include/vdso/types.h
new file mode 100644
index 000000000000..f456a0a6a2e1
--- /dev/null
+++ b/include/vdso/types.h
@@ -0,0 +1,39 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_TYPES_H
+#define __VDSO_TYPES_H
+
+#ifdef __KERNEL__
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/time.h>
+
+/*
+ * The definitions below are required to overcome the limitations
+ * of time_t on 32 bit architectures, which overflows in 2038.
+ * The new code should use the replacements based on time64_t and
+ * timespec64.
+ *
+ * The abstraction below will be updated once the migration to
+ * time64_t is complete.
+ */
+#ifdef CONFIG_GENERIC_VDSO_32
+#define __vdso_timespec		old_timespec32
+#define __vdso_timeval		old_timeval32
+#else
+#ifdef ENABLE_COMPAT_VDSO
+#define __vdso_timespec		old_timespec32
+#define __vdso_timeval		old_timeval32
+#else
+#define __vdso_timespec		__kernel_timespec
+#define __vdso_timeval		__kernel_old_timeval
+#endif /* CONFIG_COMPAT_VDSO */
+#endif /* CONFIG_GENERIC_VDSO_32 */
+
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
+
+#endif /* __VDSO_TYPES_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index a9e56539bd11..dff3e3c782da 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -565,6 +565,11 @@  config OID_REGISTRY
 config UCS2_STRING
         tristate
 
+#
+# generic vdso
+#
+source "lib/vdso/Kconfig"
+
 source "lib/fonts/Kconfig"
 
 config SG_SPLIT
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
new file mode 100644
index 000000000000..34d91f952d70
--- /dev/null
+++ b/lib/vdso/Kconfig
@@ -0,0 +1,37 @@ 
+# SPDX-License-Identifier: GPL-2.0
+
+config HAVE_GENERIC_VDSO
+	bool
+	default n
+
+if HAVE_GENERIC_VDSO
+
+config GENERIC_GETTIMEOFDAY
+	bool
+	help
+	  This is a generic implementation of the gettimeofday vDSO.
+	  Each architecture that enables this feature has to
+	  provide the fallback implementations.
+
+config GENERIC_VDSO_32
+	bool
+	depends on GENERIC_GETTIMEOFDAY && !64BIT
+	help
+	  This config option helps to avoid possible performance issues
+	  on 32 bit only architectures.
+
+config GENERIC_COMPAT_VDSO
+	bool
+	help
+	  This config option enables the compat VDSO layer.
+
+config CROSS_COMPILE_COMPAT_VDSO
+	string "32 bit Toolchain prefix for compat vDSO"
+	default ""
+	depends on GENERIC_COMPAT_VDSO
+	help
+	  Defines the cross-compiler prefix for compiling the compat vDSO.
+	  If a 64 bit compiler (e.g. on x86_64) can compile the vDSO for
+	  32 bit, this parameter does not need to be defined.
+
+endif
diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile
new file mode 100644
index 000000000000..c415a685d61b
--- /dev/null
+++ b/lib/vdso/Makefile
@@ -0,0 +1,22 @@ 
+# SPDX-License-Identifier: GPL-2.0
+
+GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH))
+
+c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c)
+
+# This cmd checks that the vdso library does not contain absolute relocations.
+# It has to be called after the linking of the vdso library and requires it
+# as a parameter.
+#
+# $(ARCH_REL_TYPE_ABS) is defined in the arch specific makefile and corresponds
+# to the absolute relocation types printed by "objdump -R" and accepted by the
+# dynamic linker.
+ifndef ARCH_REL_TYPE_ABS
+$(error ARCH_REL_TYPE_ABS is not set)
+endif
+
+quiet_cmd_vdso_check = VDSOCHK $@
+      cmd_vdso_check = if $(OBJDUMP) -R $@ | egrep -h "$(ARCH_REL_TYPE_ABS)"; \
+		       then (echo >&2 "$@: dynamic relocations are not supported"; \
+			     rm -f $@; /bin/false); fi
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
new file mode 100644
index 000000000000..39f92f7d3218
--- /dev/null
+++ b/lib/vdso/gettimeofday.c
@@ -0,0 +1,175 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic userspace implementations of gettimeofday() and similar.
+ */
+#include <linux/compiler.h>
+#include <linux/math64.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/hrtimer.h>
+#include <vdso/datapage.h>
+#include <vdso/helpers.h>
+
+/*
+ * The generic vDSO implementation requires that gettimeofday.h
+ * provides:
+ * - __arch_get_vdso_data(): to get the vdso datapage.
+ * - __arch_get_hw_counter(): to get the hw counter based on the
+ *   clock_mode.
+ * - gettimeofday_fallback(): fallback for gettimeofday.
+ * - clock_gettime_fallback(): fallback for clock_gettime.
+ * - clock_getres_fallback(): fallback for clock_getres.
+ */
+#include <asm/vdso/gettimeofday.h>
+
+static notrace int do_hres(const struct vdso_data *vd,
+			   clockid_t clk,
+			   struct __vdso_timespec *ts)
+{
+	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
+	u64 cycles, last, sec, ns;
+	u32 seq, cs_index = CLOCKSOURCE_MONO;
+
+	if (clk == CLOCK_MONOTONIC_RAW)
+		cs_index = CLOCKSOURCE_RAW;
+
+	do {
+		seq = vdso_read_begin(vd);
+		cycles = __arch_get_hw_counter(vd->clock_mode) &
+			vd->cs[cs_index].mask;
+		ns = vdso_ts->nsec;
+		last = vd->cycle_last;
+		if (unlikely((s64)cycles < 0))
+			return clock_gettime_fallback(clk, ts);
+		if (cycles > last)
+			ns += (cycles - last) * vd->cs[cs_index].mult;
+		ns >>= vd->cs[cs_index].shift;
+		sec = vdso_ts->sec;
+	} while (unlikely(vdso_read_retry(vd, seq)));
+
+	/*
+	 * Do this outside the loop: a race inside the loop could result
+	 * in __iter_div_u64_rem() being extremely slow.
+	 */
+	ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+	ts->tv_nsec = ns;
+
+	return 0;
+}
+
+static notrace void do_coarse(const struct vdso_data *vd,
+			      clockid_t clk,
+			      struct __vdso_timespec *ts)
+{
+	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
+	u32 seq;
+
+	do {
+		seq = vdso_read_begin(vd);
+		ts->tv_sec = vdso_ts->sec;
+		ts->tv_nsec = vdso_ts->nsec;
+	} while (unlikely(vdso_read_retry(vd, seq)));
+}
+
+static notrace int __cvdso_clock_gettime(clockid_t clock,
+					 struct __vdso_timespec *ts)
+{
+	const struct vdso_data *vd = __arch_get_vdso_data();
+	u32 msk;
+
+	/* Check for negative values or invalid clocks */
+	if (unlikely((u32) clock >= MAX_CLOCKS))
+		goto fallback;
+
+	/*
+	 * Convert the clockid to a bitmask and use it to check which
+	 * clocks are handled in the VDSO directly.
+	 */
+	msk = 1U << clock;
+	if (likely(msk & VDSO_HRES)) {
+		return do_hres(vd, clock, ts);
+	} else if (msk & VDSO_COARSE) {
+		do_coarse(vd, clock, ts);
+		return 0;
+	}
+fallback:
+	return clock_gettime_fallback(clock, ts);
+}
+
+static notrace int __cvdso_gettimeofday(struct __vdso_timeval *tv,
+					struct timezone *tz)
+{
+	const struct vdso_data *vd = __arch_get_vdso_data();
+
+	if (likely(tv != NULL)) {
+		struct __vdso_timespec ts;
+
+		if (do_hres(vd, CLOCK_REALTIME, &ts))
+			return gettimeofday_fallback(tv, tz);
+
+		tv->tv_sec = ts.tv_sec;
+		tv->tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+	}
+
+	if (unlikely(tz != NULL)) {
+		tz->tz_minuteswest = vd->tz_minuteswest;
+		tz->tz_dsttime = vd->tz_dsttime;
+	}
+
+	return 0;
+}
+
+#ifdef VDSO_HAS_TIME
+static notrace time_t __cvdso_time(time_t *time)
+{
+	const struct vdso_data *vd = __arch_get_vdso_data();
+	time_t t = READ_ONCE(vd->basetime[CLOCK_REALTIME].sec);
+
+	if (time)
+		*time = t;
+
+	return t;
+}
+#endif /* VDSO_HAS_TIME */
+
+static notrace int __cvdso_clock_getres(clockid_t clock,
+					struct __vdso_timespec *res)
+{
+	u64 sec, ns;
+	u32 msk;
+
+	/* Check for negative values or invalid clocks */
+	if (unlikely((u32) clock >= MAX_CLOCKS))
+		goto fallback;
+
+	/*
+	 * Convert the clockid to a bitmask and use it to check which
+	 * clocks are handled in the VDSO directly.
+	 */
+	msk = 1U << clock;
+	if (msk & VDSO_HRES) {
+		/*
+		 * Preserves the behaviour of posix_get_hrtimer_res().
+		 */
+		sec = 0;
+		ns = MONOTONIC_RES_NSEC;
+	} else if (msk & VDSO_COARSE) {
+		/*
+		 * Preserves the behaviour of posix_get_coarse_res().
+		 */
+		ns = LOW_RES_NSEC;
+		sec = __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+	} else {
+		goto fallback;
+	}
+
+	if (res) {
+		res->tv_sec = sec;
+		res->tv_nsec = ns;
+	}
+
+	return 0;
+
+fallback:
+	return clock_getres_fallback(clock, res);
+}