diff mbox series

[v10,3/4] random: introduce generic vDSO getrandom() implementation

Message ID 20221129210639.42233-4-Jason@zx2c4.com (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show
Series implement getrandom() in vDSO | expand

Commit Message

Jason A. Donenfeld Nov. 29, 2022, 9:06 p.m. UTC
Provide a generic C vDSO getrandom() implementation, which operates on
an opaque state returned by vgetrandom_alloc() and produces random bytes
the same way as getrandom(). This has a the API signature:

  ssize_t vgetrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state);

The return value and the first 3 arguments are the same as ordinary
getrandom(), while the last argument is a pointer to the opaque
allocated state. Were all four arguments passed to the getrandom()
syscall, nothing different would happen, and the functions would have
the exact same behavior.

The actual vDSO RNG algorithm implemented is the same one implemented by
drivers/char/random.c, using the same fast-erasure techniques as that.
Should the in-kernel implementation change, so too will the vDSO one.

It requires an implementation of ChaCha20 that does not use any stack,
in order to maintain forward secrecy if a multi-threaded program forks
(though this does not account for a similar issue with SA_SIGINFO
copying registers to the stack), so this is left as an
architecture-specific fill-in. Stack-less ChaCha20 is an easy algorithm
to implement on a variety of architectures, so this shouldn't be too
onerous.

Initially, the state is keyless, and so the first call makes a
getrandom() syscall to generate that key, and then uses it for
subsequent calls. By keeping track of a generation counter, it knows
when its key is invalidated and it should fetch a new one using the
syscall. Later, more than just a generation counter might be used.

Since MADV_WIPEONFORK is set on the opaque state, the key and related
state is wiped during a fork(), so secrets don't roll over into new
processes, and the same state doesn't accidentally generate the same
random stream. The generation counter, as well, is always >0, so that
the 0 counter is a useful indication of a fork() or otherwise
uninitialized state.

If the kernel RNG is not yet initialized, then the vDSO always calls the
syscall, because that behavior cannot be emulated in userspace, but
fortunately that state is short lived and only during early boot. If it
has been initialized, then there is no need to inspect the `flags`
argument, because the behavior does not change post-initialization
regardless of the `flags` value.

Since the opaque state passed to it is mutated, vDSO getrandom() is not
reentrant, when used with the same opaque state, which libc should be
mindful of.

vgetrandom_alloc() and vDSO getrandom() together provide the ability for
userspace to generate random bytes quickly and safely, and is intended
to be integrated into libc's thread management. As an illustrative
example, the following code might be used to do the same outside of
libc. All of the static functions are to be considered implementation
private, including the vgetrandom_alloc() syscall wrapper, which
generally shouldn't be exposed outside of libc, with the non-static
vgetrandom() function at the end being the exported interface. The
various pthread-isms are expected to be elided into libc internals. This
per-thread allocation scheme is very naive and does not shrink; other
implementations may choose to be more complex.

  static void *vgetrandom_alloc(unsigned int *num, unsigned int *size_per_each, unsigned int flags)
  {
    long ret = syscall(__NR_vgetrandom_alloc, &num, &size_per_each, flags);
    return ret == -1 ? NULL : (void *)ret;
  }

  static struct {
    pthread_mutex_t lock;
    void **states;
    size_t len, cap;
  } grnd_allocator = {
    .lock = PTHREAD_MUTEX_INITIALIZER
  };

  static void *vgetrandom_get_state(void)
  {
    void *state = NULL;

    pthread_mutex_lock(&grnd_allocator.lock);
    if (!grnd_allocator.len) {
      size_t new_cap;
      unsigned int size_per_each, num = 16; /* Just a hint. Could also be nr_cpus. */
      void *new_block = vgetrandom_alloc(&num, &size_per_each, 0), *new_states;

      if (!new_block)
        goto out;
      new_cap = grnd_allocator.cap + num;
      new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states));
      if (!new_states) {
        munmap(new_block, num * size_per_each);
        goto out;
      }
      grnd_allocator.cap = new_cap;
      grnd_allocator.states = new_states;

      for (size_t i = 0; i < num; ++i) {
        grnd_allocator.states[i] = new_block;
        new_block += size_per_each;
      }
      grnd_allocator.len = num;
    }
    state = grnd_allocator.states[--grnd_allocator.len];

  out:
    pthread_mutex_unlock(&grnd_allocator.lock);
    return state;
  }

  static void vgetrandom_put_state(void *state)
  {
    if (!state)
      return;
    pthread_mutex_lock(&grnd_allocator.lock);
    grnd_allocator.states[grnd_allocator.len++] = state;
    pthread_mutex_unlock(&grnd_allocator.lock);
  }

  static struct {
    ssize_t(*fn)(void *buf, size_t len, unsigned long flags, void *state);
    pthread_key_t key;
    pthread_once_t initialized;
  } grnd_ctx = {
    .initialized = PTHREAD_ONCE_INIT
  };

  static void vgetrandom_init(void)
  {
    if (pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0)
      return;
    grnd_ctx.fn = __vdsosym("LINUX_2.6", "__vdso_getrandom");
  }

  ssize_t vgetrandom(void *buf, size_t len, unsigned long flags)
  {
    void *state;

    pthread_once(&grnd_ctx.initialized, vgetrandom_init);
    if (!grnd_ctx.fn)
      return getrandom(buf, len, flags);
    state = pthread_getspecific(grnd_ctx.key);
    if (!state) {
      state = vgetrandom_get_state();
      if (pthread_setspecific(grnd_ctx.key, state) != 0) {
        vgetrandom_put_state(state);
        state = NULL;
      }
      if (!state)
        return getrandom(buf, len, flags);
    }
    return grnd_ctx.fn(buf, len, flags, state);
  }

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 MAINTAINERS             |   1 +
 drivers/char/random.c   |   9 ++
 include/vdso/datapage.h |  11 +++
 lib/vdso/Kconfig        |   7 +-
 lib/vdso/getrandom.c    | 204 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 lib/vdso/getrandom.c

Comments

Thomas Gleixner Nov. 29, 2022, 10:42 p.m. UTC | #1
Jason!

On Tue, Nov 29 2022 at 22:06, Jason A. Donenfeld wrote:
> +/**
> + * struct vdso_rng_data - vdso RNG state information
> + * @generation:	a counter representing the number of RNG reseeds

A counter

> + * @is_ready:	whether the RNG is initialized

Signals whether ...

> + */
> +struct vdso_rng_data {
> +	unsigned long	generation;
> +	bool		is_ready;
> +};
> +
> +
> +#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \
> +	while (len >= sizeof(type)) { \
> +		__put_unaligned_t(type, __get_unaligned_t(type, src), dst); \
> +		__put_unaligned_t(type, 0, src); \
> +		dst += sizeof(type); \
> +		src += sizeof(type); \
> +		len -= sizeof(type); \
> +	} \
> +} while (0)

I'd appreciate it if you go back to the code I suggested to you and
compare and contrast it in terms of readability.

> +
> +static void memcpy_and_zero_src(void *dst, void *src, size_t len)
> +{
> +	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
> +		if (IS_ENABLED(CONFIG_64BIT))
> +			MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
> +		MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
> +		MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
> +	}
> +	MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
> +}
> +
> +/**
> + * __cvdso_getrandom_data - generic vDSO implementation of getrandom() syscall
> + * @rng_info:		describes state of kernel RNG, memory shared with kernel
> + * @buffer:		destination buffer to fill with random bytes
> + * @len:		size of @buffer in bytes
> + * @flags:		zero or more GRND_* flags
> + * @opaque_state:	a pointer to an opaque state area

NIT. Please start the explanations with an uppercase letter

> +		/*
> +		 * Set @state->pos to beyond the end of the batch, so that the batch is refilled
> +		 * using the new key.
> +		 */
> +		state->pos = sizeof(state->batch);
> +	}
> +

This one is odd:

> +	len = ret;

@ret is not modified after the initialization at the top of the
function:

> +	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);

so I really had to go up a page and figure out what the story is.

> +
> +	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
> +	state->pos = 0;
> +	goto more_batch;

Aside of the nitpicks above, thank you very much for making this
comprehensible.

The comments are well done and appreciated and I'm pretty sure that this
part:

> +	in_use = READ_ONCE(state->in_use);
> +	if (unlikely(in_use))
> +		goto fallback_syscall;
> +	WRITE_ONCE(state->in_use, true);

was very much induced by writing those comments :)

Thanks,

        tglx
Jason A. Donenfeld Nov. 30, 2022, 1:09 a.m. UTC | #2
Hi Thomas,

On Tue, Nov 29, 2022 at 11:42:16PM +0100, Thomas Gleixner wrote:
> Jason!
> 
> On Tue, Nov 29 2022 at 22:06, Jason A. Donenfeld wrote:
> > +/**
> > + * struct vdso_rng_data - vdso RNG state information
> > + * @generation:	a counter representing the number of RNG reseeds

FYI, ever other struct in this file uses lower case and no punctuation,
so I'll follow that for this one.

> A counter
> 
> > + * @is_ready:	whether the RNG is initialized
> 
> Signals whether ...

Ack.

> > + */
> > +struct vdso_rng_data {
> > +	unsigned long	generation;
> > +	bool		is_ready;
> > +};
> > +
> > +
> > +#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \
> > +	while (len >= sizeof(type)) { \
> > +		__put_unaligned_t(type, __get_unaligned_t(type, src), dst); \
> > +		__put_unaligned_t(type, 0, src); \
> > +		dst += sizeof(type); \
> > +		src += sizeof(type); \
> > +		len -= sizeof(type); \
> > +	} \
> > +} while (0)
> 
> I'd appreciate it if you go back to the code I suggested to you and
> compare and contrast it in terms of readability.

Ahh, you like to align your \. Okay, I'll do that. I also added a do {
... } while (0) wrapper around it, but I think it makes sense to keep
that so that there aren't stray semicolons.

> > +/**
> > + * __cvdso_getrandom_data - generic vDSO implementation of getrandom() syscall
> > + * @rng_info:		describes state of kernel RNG, memory shared with kernel
> > + * @buffer:		destination buffer to fill with random bytes
> > + * @len:		size of @buffer in bytes
> > + * @flags:		zero or more GRND_* flags
> > + * @opaque_state:	a pointer to an opaque state area
> 
> NIT. Please start the explanations with an uppercase letter

Okay. Will do everywhere in this patchset except for in vdso/datapage.h.

 
> This one is odd:
> 
> > +	len = ret;
> 
> @ret is not modified after the initialization at the top of the
> function:
> 
> > +	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
> 
> so I really had to go up a page and figure out what the story is.

If the generation changes, and it's tried again, the whole random buffer
is filled again, so that has to be reset. I'll leave a comment.

> > +
> > +	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
> > +	state->pos = 0;
> > +	goto more_batch;
> 
> Aside of the nitpicks above, thank you very much for making this
> comprehensible.

Thanks for nudging me in the right direction.

> 
> The comments are well done and appreciated and I'm pretty sure that this
> part:
> 
> > +	in_use = READ_ONCE(state->in_use);
> > +	if (unlikely(in_use))
> > +		goto fallback_syscall;
> > +	WRITE_ONCE(state->in_use, true);
> 
> was very much induced by writing those comments :)

Well, not exactly, unfortunately. Adhemerval -- the glibc maintainer
working on the libc side of this -- and I have been discussing signal
handling craziness and lots of different schemes over the last week+,
and this rather simple thing is the result of those conversations.

Jason
Florian Weimer Nov. 30, 2022, 10:44 a.m. UTC | #3
* Jason A. Donenfeld:

> diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
> index 73eb622e7663..9ae4d76b36c7 100644
> --- a/include/vdso/datapage.h
> +++ b/include/vdso/datapage.h
> @@ -109,6 +109,16 @@ struct vdso_data {
>  	struct arch_vdso_data	arch_data;
>  };
>  
> +/**
> + * struct vdso_rng_data - vdso RNG state information
> + * @generation:	a counter representing the number of RNG reseeds
> + * @is_ready:	whether the RNG is initialized
> + */
> +struct vdso_rng_data {
> +	unsigned long	generation;
> +	bool		is_ready;
> +};
> +

I don't think you can use a type like long here.  The header says this:

 * vdso_data will be accessed by 64 bit and compat code at the same time
 * so we should be careful before modifying this structure.

So the ABI must be same for 32-bit and 64-bit mode, and long isn't.

Thanks,
Florian
Jason A. Donenfeld Nov. 30, 2022, 2:51 p.m. UTC | #4
Hi Florian,

On Wed, Nov 30, 2022 at 11:44:30AM +0100, Florian Weimer wrote:
> * Jason A. Donenfeld:
> 
> > diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
> > index 73eb622e7663..9ae4d76b36c7 100644
> > --- a/include/vdso/datapage.h
> > +++ b/include/vdso/datapage.h
> > @@ -109,6 +109,16 @@ struct vdso_data {
> >  	struct arch_vdso_data	arch_data;
> >  };
> >  
> > +/**
> > + * struct vdso_rng_data - vdso RNG state information
> > + * @generation:	a counter representing the number of RNG reseeds
> > + * @is_ready:	whether the RNG is initialized
> > + */
> > +struct vdso_rng_data {
> > +	unsigned long	generation;
> > +	bool		is_ready;
> > +};
> > +
> 
> I don't think you can use a type like long here.  The header says this:
> 
>  * vdso_data will be accessed by 64 bit and compat code at the same time
>  * so we should be careful before modifying this structure.
> 
> So the ABI must be same for 32-bit and 64-bit mode, and long isn't.

Excellent point. The size of the type needs to be explicit. Will fix.

Jason
Jason A. Donenfeld Nov. 30, 2022, 2:59 p.m. UTC | #5
On Wed, Nov 30, 2022 at 03:51:01PM +0100, Jason A. Donenfeld wrote:
> Hi Florian,
> 
> On Wed, Nov 30, 2022 at 11:44:30AM +0100, Florian Weimer wrote:
> > * Jason A. Donenfeld:
> > 
> > > diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
> > > index 73eb622e7663..9ae4d76b36c7 100644
> > > --- a/include/vdso/datapage.h
> > > +++ b/include/vdso/datapage.h
> > > @@ -109,6 +109,16 @@ struct vdso_data {
> > >  	struct arch_vdso_data	arch_data;
> > >  };
> > >  
> > > +/**
> > > + * struct vdso_rng_data - vdso RNG state information
> > > + * @generation:	a counter representing the number of RNG reseeds
> > > + * @is_ready:	whether the RNG is initialized
> > > + */
> > > +struct vdso_rng_data {
> > > +	unsigned long	generation;
> > > +	bool		is_ready;
> > > +};
> > > +
> > 
> > I don't think you can use a type like long here.  The header says this:
> > 
> >  * vdso_data will be accessed by 64 bit and compat code at the same time
> >  * so we should be careful before modifying this structure.
> > 
> > So the ABI must be same for 32-bit and 64-bit mode, and long isn't.
> 
> Excellent point. The size of the type needs to be explicit. Will fix.

I'll do something like this:

diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index 80e25cdcdb1c..218bbeac5613 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -19,6 +19,19 @@
 #include <vdso/time32.h>
 #include <vdso/time64.h>

+/**
+ * type vdso_kernel_ulong - unsigned long type that matches kernel's unsigned long
+ *
+ * The structs in this file must operate the same way in both 64-bit code and
+ * in 32-bit compat code, over the same potentially 64-bit kernel. So, this
+ * type represents the size of an unsigned long as used by kernel-space code.
+ */
+#ifdef CONFIG_64BIT
+typedef u64 vdso_kernel_ulong;
+#else
+typedef u32 vdso_kernel_ulong;
+#endif
+
 #ifdef CONFIG_ARCH_HAS_VDSO_DATA
 #include <asm/vdso/data.h>
 #else
@@ -115,8 +128,8 @@ struct vdso_data {
  * @is_ready:	signals whether the RNG is initialized
  */
 struct vdso_rng_data {
-	unsigned long	generation;
-	bool		is_ready;
+	vdso_kernel_ulong	generation;
+	bool			is_ready;
 };

 /*
Arnd Bergmann Nov. 30, 2022, 3:07 p.m. UTC | #6
On Wed, Nov 30, 2022, at 15:59, Jason A. Donenfeld wrote:
> On Wed, Nov 30, 2022 at 03:51:01PM +0100, Jason A. Donenfeld wrote:
>> On Wed, Nov 30, 2022 at 11:44:30AM +0100, Florian Weimer wrote:
>> > 
>> >  * vdso_data will be accessed by 64 bit and compat code at the same time
>> >  * so we should be careful before modifying this structure.
>> > 
>> > So the ABI must be same for 32-bit and 64-bit mode, and long isn't.

> I'll do something like this:
>
> 
> +#ifdef CONFIG_64BIT
> +typedef u64 vdso_kernel_ulong;
> +#else
> +typedef u32 vdso_kernel_ulong;
> +#endif

This does not address the ABI concern: to allow 32-bit and 64-bit
tasks to share the same data page, it has to be the same width on
both, either u32 or 64, but not depending on a configuration
option.

> struct vdso_rng_data {
>	vdso_kernel_ulong	generation;
>	bool			is_ready;
> };

There is another problem with this: you have implicit padding
in the structure because the two members have different size
and alignment requirements. The easiest fix is to make them
both u64, or you could have a u32 is_ready and an explit u32
for the padding.

      Arnd
Jason A. Donenfeld Nov. 30, 2022, 3:12 p.m. UTC | #7
Hi Arnd,

On Wed, Nov 30, 2022 at 4:07 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > +#ifdef CONFIG_64BIT
> > +typedef u64 vdso_kernel_ulong;
> > +#else
> > +typedef u32 vdso_kernel_ulong;
> > +#endif
>
> This does not address the ABI concern: to allow 32-bit and 64-bit
> tasks to share the same data page, it has to be the same width on
> both, either u32 or 64, but not depending on a configuration
> option.

I think it does address the issue. CONFIG_64BIT is a .config setting,
not a compiler-derived setting. So a 64-bit kernel will get a u64 in
kernel mode, and then it will get a u64 for the 64-bit vdso usermode
compile, and finally it will get a u64 for the 32-bit vdso usermode
compile. So in all three cases, the size is the same.

> > struct vdso_rng_data {
> >       vdso_kernel_ulong       generation;
> >       bool                    is_ready;
> > };
>
> There is another problem with this: you have implicit padding
> in the structure because the two members have different size
> and alignment requirements. The easiest fix is to make them
> both u64, or you could have a u32 is_ready and an explit u32
> for the padding.

There's padding at the end of the structure, yes. But both
`generation` and `is_ready` will be at the same offset. If the
structure grows, then sure, that'll have to be taken into account. But
that's not a problem because this is a private implementation detail
between the vdso code and the kernel.

Jason
Arnd Bergmann Nov. 30, 2022, 3:29 p.m. UTC | #8
On Wed, Nov 30, 2022, at 16:12, Jason A. Donenfeld wrote:
> Hi Arnd,
>
> On Wed, Nov 30, 2022 at 4:07 PM Arnd Bergmann <arnd@arndb.de> wrote:
>> > +#ifdef CONFIG_64BIT
>> > +typedef u64 vdso_kernel_ulong;
>> > +#else
>> > +typedef u32 vdso_kernel_ulong;
>> > +#endif
>>
>> This does not address the ABI concern: to allow 32-bit and 64-bit
>> tasks to share the same data page, it has to be the same width on
>> both, either u32 or 64, but not depending on a configuration
>> option.
>
> I think it does address the issue. CONFIG_64BIT is a .config setting,
> not a compiler-derived setting. So a 64-bit kernel will get a u64 in
> kernel mode, and then it will get a u64 for the 64-bit vdso usermode
> compile, and finally it will get a u64 for the 32-bit vdso usermode
> compile. So in all three cases, the size is the same.

I see what you mean now. However this means your vdso32 copies
are different between 32-bit and 64-bit kernels. If you need to
access one of the fields from assembler, it even ends up
different at source level, which adds a bit of complexity.

Making the interface configuration-independent makes it obvious
to the reader that none of these problems can happen.

>> > struct vdso_rng_data {
>> >       vdso_kernel_ulong       generation;
>> >       bool                    is_ready;
>> > };
>>
>> There is another problem with this: you have implicit padding
>> in the structure because the two members have different size
>> and alignment requirements. The easiest fix is to make them
>> both u64, or you could have a u32 is_ready and an explit u32
>> for the padding.
>
> There's padding at the end of the structure, yes. But both
> `generation` and `is_ready` will be at the same offset. If the
> structure grows, then sure, that'll have to be taken into account. But
> that's not a problem because this is a private implementation detail
> between the vdso code and the kernel.

I was not concerned about incompatibility here, but rather about
possibly leaking kernel data to the vdso page. Again, this probably
doesn't happen if your code is written correctly, but the rule for
kernel-user ABIs is to avoid implicit padding to ensure that
the padding bytes can never leak any information. Using structures
without padding at the minimum helps avoid having to think about
whether this can become a problem when inspecting the code for
possible issues, both from humans and from automated tools.

     Arnd
Jason A. Donenfeld Nov. 30, 2022, 3:47 p.m. UTC | #9
Hi Arnd,

On Wed, Nov 30, 2022 at 4:29 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > I think it does address the issue. CONFIG_64BIT is a .config setting,
> > not a compiler-derived setting. So a 64-bit kernel will get a u64 in
> > kernel mode, and then it will get a u64 for the 64-bit vdso usermode
> > compile, and finally it will get a u64 for the 32-bit vdso usermode
> > compile. So in all three cases, the size is the same.
>
> I see what you mean now. However this means your vdso32 copies
> are different between 32-bit and 64-bit kernels. If you need to
> access one of the fields from assembler, it even ends up
> different at source level, which adds a bit of complexity.
>
> Making the interface configuration-independent makes it obvious
> to the reader that none of these problems can happen.

Except ideally, these are word-sized accesses (where only compat code
has to suffer I suppose).

> >> > struct vdso_rng_data {
> >> >       vdso_kernel_ulong       generation;
> >> >       bool                    is_ready;
> >> > };
> >>
> >> There is another problem with this: you have implicit padding
> >> in the structure because the two members have different size
> >> and alignment requirements. The easiest fix is to make them
> >> both u64, or you could have a u32 is_ready and an explit u32
> >> for the padding.
> >
> > There's padding at the end of the structure, yes. But both
> > `generation` and `is_ready` will be at the same offset. If the
> > structure grows, then sure, that'll have to be taken into account. But
> > that's not a problem because this is a private implementation detail
> > between the vdso code and the kernel.
>
> I was not concerned about incompatibility here, but rather about
> possibly leaking kernel data to the vdso page.

The vvar page starts out zeroed, no?

Jason
Arnd Bergmann Nov. 30, 2022, 4:13 p.m. UTC | #10
On Wed, Nov 30, 2022, at 16:47, Jason A. Donenfeld wrote:

>> > There's padding at the end of the structure, yes. But both
>> > `generation` and `is_ready` will be at the same offset. If the
>> > structure grows, then sure, that'll have to be taken into account. But
>> > that's not a problem because this is a private implementation detail
>> > between the vdso code and the kernel.
>>
>> I was not concerned about incompatibility here, but rather about
>> possibly leaking kernel data to the vdso page.
>
> The vvar page starts out zeroed, no?

The typical problem is someone doing a copy_to_user() of an in-kernel
structure into the userspace side, which would then copy the
padding as well. If the source is on the stack, a malicious caller
can trick the another syscall into leaving sensitive data at this
exact stack location. Again, I'm not saying that your code is
vulnerable to that type of attack, just that making all ABI
structures not have holes is useful for auditing.

    Arnd
Jason A. Donenfeld Nov. 30, 2022, 4:40 p.m. UTC | #11
On Wed, Nov 30, 2022 at 05:13:18PM +0100, Arnd Bergmann wrote:
> On Wed, Nov 30, 2022, at 16:47, Jason A. Donenfeld wrote:
> 
> >> > There's padding at the end of the structure, yes. But both
> >> > `generation` and `is_ready` will be at the same offset. If the
> >> > structure grows, then sure, that'll have to be taken into account. But
> >> > that's not a problem because this is a private implementation detail
> >> > between the vdso code and the kernel.
> >>
> >> I was not concerned about incompatibility here, but rather about
> >> possibly leaking kernel data to the vdso page.
> >
> > The vvar page starts out zeroed, no?
> 
> The typical problem is someone doing a copy_to_user() of an in-kernel
> structure into the userspace side, which would then copy the
> padding as well. If the source is on the stack, a malicious caller
> can trick the another syscall into leaving sensitive data at this
> exact stack location.

I'm quite aware of this infoleak, having made use of it countless times
over the years. It just doesn't seem relevant to the vvar page.

Jason
Thomas Gleixner Nov. 30, 2022, 5 p.m. UTC | #12
On Wed, Nov 30 2022 at 16:47, Jason A. Donenfeld wrote:
> On Wed, Nov 30, 2022 at 4:29 PM Arnd Bergmann <arnd@arndb.de> wrote:
>> I see what you mean now. However this means your vdso32 copies
>> are different between 32-bit and 64-bit kernels. If you need to
>> access one of the fields from assembler, it even ends up
>> different at source level, which adds a bit of complexity.
>>
>> Making the interface configuration-independent makes it obvious
>> to the reader that none of these problems can happen.
>
> Except ideally, these are word-sized accesses (where only compat code
> has to suffer I suppose).

While I hate it with a passion, there is actually a valid reason to use
this ugly typedef.

On 32bit architectures which have load/store tearing of 64bit variables
into two 32bit accesses due to ISA limitations, that results in
undefined behaviour when write and read are concurrent. Neither
READ_ONCE() nor WRITE_ONCE help there.

Though that begs the question whether we need a 64bit generation counter
for the VDSO at all.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 3894f947a507..70dff39fcff9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17288,6 +17288,7 @@  S:	Maintained
 F:	drivers/char/random.c
 F:	drivers/virt/vmgenid.c
 F:	include/vdso/getrandom.h
+F:	lib/vdso/getrandom.c
 
 RAPIDIO SUBSYSTEM
 M:	Matt Porter <mporter@kernel.crashing.org>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index b81d67f3ebab..b37a3f9367a9 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -60,6 +60,9 @@ 
 #ifdef CONFIG_VGETRANDOM_ALLOC_SYSCALL
 #include <vdso/getrandom.h>
 #endif
+#ifdef CONFIG_VDSO_GETRANDOM
+#include <vdso/datapage.h>
+#endif
 #include <asm/processor.h>
 #include <asm/irq.h>
 #include <asm/irq_regs.h>
@@ -344,6 +347,9 @@  static void crng_reseed(struct work_struct *work)
 	if (next_gen == ULONG_MAX)
 		++next_gen;
 	WRITE_ONCE(base_crng.generation, next_gen);
+#ifdef CONFIG_VDSO_GETRANDOM
+	smp_store_release(&_vdso_rng_data.generation, next_gen + 1);
+#endif
 	if (!static_branch_likely(&crng_is_ready))
 		crng_init = CRNG_READY;
 	spin_unlock_irqrestore(&base_crng.lock, flags);
@@ -794,6 +800,9 @@  static void __cold _credit_init_bits(size_t bits)
 		if (static_key_initialized)
 			execute_in_process_context(crng_set_ready, &set_ready);
 		atomic_notifier_call_chain(&random_ready_notifier, 0, NULL);
+#ifdef CONFIG_VDSO_GETRANDOM
+		smp_store_release(&_vdso_rng_data.is_ready, true);
+#endif
 		wake_up_interruptible(&crng_init_wait);
 		kill_fasync(&fasync, SIGIO, POLL_IN);
 		pr_notice("crng init done\n");
diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index 73eb622e7663..9ae4d76b36c7 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -109,6 +109,16 @@  struct vdso_data {
 	struct arch_vdso_data	arch_data;
 };
 
+/**
+ * struct vdso_rng_data - vdso RNG state information
+ * @generation:	a counter representing the number of RNG reseeds
+ * @is_ready:	whether the RNG is initialized
+ */
+struct vdso_rng_data {
+	unsigned long	generation;
+	bool		is_ready;
+};
+
 /*
  * We use the hidden visibility to prevent the compiler from generating a GOT
  * relocation. Not only is going through a GOT useless (the entry couldn't and
@@ -120,6 +130,7 @@  struct vdso_data {
  */
 extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden")));
 extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden")));
+extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden")));
 
 /*
  * The generic vDSO implementation requires that gettimeofday.h
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index b22584f8da03..f12b76642921 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -29,7 +29,6 @@  config GENERIC_VDSO_TIME_NS
 	help
 	  Selected by architectures which support time namespaces in the
 	  VDSO
-
 endif
 
 config VGETRANDOM_ALLOC_SYSCALL
@@ -38,3 +37,9 @@  config VGETRANDOM_ALLOC_SYSCALL
 	help
 	  Selected by the getrandom() vDSO function, which requires this
 	  for state allocation.
+
+config VDSO_GETRANDOM
+	bool
+	select VGETRANDOM_ALLOC_SYSCALL
+	help
+	  Selected by architectures that support vDSO getrandom().
diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c
new file mode 100644
index 000000000000..1c51e24a7f24
--- /dev/null
+++ b/lib/vdso/getrandom.c
@@ -0,0 +1,204 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <vdso/datapage.h>
+#include <vdso/getrandom.h>
+#include <asm/vdso/getrandom.h>
+#include <asm/vdso/vsyscall.h>
+
+#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \
+	while (len >= sizeof(type)) { \
+		__put_unaligned_t(type, __get_unaligned_t(type, src), dst); \
+		__put_unaligned_t(type, 0, src); \
+		dst += sizeof(type); \
+		src += sizeof(type); \
+		len -= sizeof(type); \
+	} \
+} while (0)
+
+static void memcpy_and_zero_src(void *dst, void *src, size_t len)
+{
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+		if (IS_ENABLED(CONFIG_64BIT))
+			MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
+		MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
+		MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
+	}
+	MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
+}
+
+/**
+ * __cvdso_getrandom_data - generic vDSO implementation of getrandom() syscall
+ * @rng_info:		describes state of kernel RNG, memory shared with kernel
+ * @buffer:		destination buffer to fill with random bytes
+ * @len:		size of @buffer in bytes
+ * @flags:		zero or more GRND_* flags
+ * @opaque_state:	a pointer to an opaque state area
+ *
+ * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
+ * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
+ * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
+ * calls into the syscall.
+ *
+ * @opaque_state *must* be allocated using the vgetrandom_alloc() syscall.  Unless external locking
+ * is used, one state must be allocated per thread, as it is not safe to call this function
+ * concurrently with the same @opaque_state. However, it is safe to call this using the same
+ * @opaque_state that is shared between main code and signal handling code, within the same thread.
+ *
+ * Returns the number of random bytes written to @buffer, or a negative value indicating an error.
+ */
+static __always_inline ssize_t
+__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
+		       unsigned int flags, void *opaque_state)
+{
+	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
+	struct vgetrandom_state *state = opaque_state;
+	size_t batch_len, nblocks, orig_len = len;
+	unsigned long current_generation;
+	void *orig_buffer = buffer;
+	u32 counter[2] = { 0 };
+	bool in_use;
+
+	/*
+	 * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
+	 * userspace, because A) the various @flags require this to block, or not, depending on
+	 * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
+	 * ready is to reseed from the entropy pool at every invocation.
+	 */
+	if (unlikely(!READ_ONCE(rng_info->is_ready)))
+		goto fallback_syscall;
+
+	/*
+	 * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
+	 * initialized, the @flags parameter may require this to block or return an error, even when
+	 * len is zero.
+	 */
+	if (unlikely(!len))
+		return 0;
+
+	/*
+	 * @state->in_use is basic reentrancy protection against this running in a signal handler
+	 * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
+	 * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
+	 * writing @state->in_use, there is still no race, because the signal handler will run to
+	 * its completion before returning execution.
+	 */
+	in_use = READ_ONCE(state->in_use);
+	if (unlikely(in_use))
+		goto fallback_syscall;
+	WRITE_ONCE(state->in_use, true);
+
+retry_generation:
+	/*
+	 * @rng_info->generation must always be read here, as it serializes @state->key with the
+	 * kernel's RNG reseeding schedule.
+	 */
+	current_generation = READ_ONCE(rng_info->generation);
+
+	/*
+	 * If @state->generation doesn't match the kernel RNG's generation, then it means the
+	 * kernel's RNG has reseeded, and so @state->key is reseeded as well.
+	 */
+	if (unlikely(state->generation != current_generation)) {
+		/*
+		 * Write the generation before filling the key, in case of fork. If there is a fork
+		 * just after this line, the two forks will get different random bytes from the
+		 * syscall, which is good. However, were this line to occur after the getrandom
+		 * syscall, then both child and parent could have the same bytes and the same
+		 * generation counter, so the fork would not be detected. Therefore, write
+		 * @state->generation before the call to the getrandom syscall.
+		 */
+		WRITE_ONCE(state->generation, current_generation);
+
+		/* Reseed @state->key using fresh bytes from the kernel. */
+		if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
+			/*
+			 * If the syscall failed to refresh the key, then @state->key is now
+			 * invalid, so invalidate the generation so that it is not used again, and
+			 * fallback to using the syscall entirely.
+			 */
+			WRITE_ONCE(state->generation, 0);
+
+			/*
+			 * Set @state->in_use to false only after the last write to @state in the
+			 * line above.
+			 */
+			WRITE_ONCE(state->in_use, false);
+
+			goto fallback_syscall;
+		}
+
+		/*
+		 * Set @state->pos to beyond the end of the batch, so that the batch is refilled
+		 * using the new key.
+		 */
+		state->pos = sizeof(state->batch);
+	}
+
+	len = ret;
+more_batch:
+	/*
+	 * First use bytes out of @state->batch, which may have been filled by the last call to this
+	 * function.
+	 */
+	batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
+	if (batch_len) {
+		/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
+		memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
+		state->pos += batch_len;
+		buffer += batch_len;
+		len -= batch_len;
+	}
+
+	if (!len) {
+		/*
+		 * Since @rng_info->generation will never be 0, re-read @state->generation, rather
+		 * than using the local current_generation variable, to learn whether a fork
+		 * occurred. Primarily, though, this indicates whether the kernel's RNG has
+		 * reseeded, in which case generate a new key and start over.
+		 */
+		if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
+			buffer = orig_buffer;
+			goto retry_generation;
+		}
+
+		/*
+		 * Set @state->in_use to false only when there will be no more reads or writes of
+		 * @state.
+		 */
+		WRITE_ONCE(state->in_use, false);
+		return ret;
+	}
+
+	/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
+	nblocks = len / CHACHA_BLOCK_SIZE;
+	if (nblocks) {
+		__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
+		buffer += nblocks * CHACHA_BLOCK_SIZE;
+		len -= nblocks * CHACHA_BLOCK_SIZE;
+	}
+
+	BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);
+
+	/* Refill the batch and then overwrite the key, in order to preserve forward secrecy. */
+	__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
+				       sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);
+
+	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
+	state->pos = 0;
+	goto more_batch;
+
+fallback_syscall:
+	return getrandom_syscall(orig_buffer, orig_len, flags);
+}
+
+static __always_inline ssize_t
+__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state)
+{
+	return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state);
+}