diff mbox series

[v7,3/3] x86: vdso: Wire up getrandom() vDSO implementation

Message ID 20221124165536.1631325-4-Jason@zx2c4.com (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show
Series implement getrandom() in vDSO | expand

Commit Message

Jason A. Donenfeld Nov. 24, 2022, 4:55 p.m. UTC
Hook up the generic vDSO implementation to the x86 vDSO data page. Since
the existing vDSO infrastructure is heavily based on the timekeeping
functionality, which works over arrays of bases, a new macro is
introduced for vvars that are not arrays.

The vDSO function requires a ChaCha20 implementation that does not write
to the stack, yet can still do an entire ChaCha20 permutation, so
provide this using SSE2, since this is userland code that must work on
all x86-64 processors.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/Kconfig                        |   1 +
 arch/x86/entry/vdso/Makefile            |   3 +-
 arch/x86/entry/vdso/vdso.lds.S          |   2 +
 arch/x86/entry/vdso/vgetrandom-chacha.S | 179 ++++++++++++++++++++++++
 arch/x86/entry/vdso/vgetrandom.c        |  18 +++
 arch/x86/include/asm/vdso/getrandom.h   |  49 +++++++
 arch/x86/include/asm/vdso/vsyscall.h    |   2 +
 arch/x86/include/asm/vvar.h             |  16 +++
 8 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/entry/vdso/vgetrandom-chacha.S
 create mode 100644 arch/x86/entry/vdso/vgetrandom.c
 create mode 100644 arch/x86/include/asm/vdso/getrandom.h

Comments

Thomas Gleixner Nov. 25, 2022, 11:08 p.m. UTC | #1
Jason!

On Thu, Nov 24 2022 at 17:55, Jason A. Donenfeld wrote:
> +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> +/*
> + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> + * counter. Importantly does not spill to the stack. Its arguments are:

Basic or not. This needs a Reviewed-by from someone who understands SSE2
and ChaCha20 before this can go anywhere near the x86 tree.

> +++ b/arch/x86/entry/vdso/vgetrandom.c
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +#include <linux/kernel.h>

Why do you need kernel.h here?

> +#include <linux/types.h>
> +
> +#include "../../../../lib/vdso/getrandom.c"
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state);
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state)
> +{
> +	return __cvdso_getrandom(buffer, len, flags, state);
> +}
> +
> +ssize_t getrandom(void *, size_t, unsigned int, void *)
> +	__attribute__((weak, alias("__vdso_getrandom")));
> diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
> new file mode 100644
> index 000000000000..099aca58ef20
> --- /dev/null
> +++ b/arch/x86/include/asm/vdso/getrandom.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +#ifndef __ASM_VDSO_GETRANDOM_H
> +#define __ASM_VDSO_GETRANDOM_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <asm/unistd.h>
> +#include <asm/vvar.h>
> +
> +static __always_inline ssize_t
> +getrandom_syscall(void *buffer, size_t len, unsigned int flags)

static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)

please. We expanded to 100 quite some time ago.

Some kernel-doc compliant comment for this would be appreciated as well.

> +{
> +	long ret;
> +
> +	asm ("syscall" : "=a" (ret) :
> +	     "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
> +	     "rcx", "r11", "memory");
> +
> +	return ret;
> +}
> +
> +#define __vdso_rng_data (VVAR(_vdso_rng_data))
> +
> +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
> +{
> +	if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
> +		return (void *)&__vdso_rng_data +
> +		       ((void *)&__timens_vdso_data - (void *)&__vdso_data);
> +	return &__vdso_rng_data;

So either bite the bullet and write it:

	if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
		return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data);

        return &__vdso_rng_data;

or comply with the well-documented rules of the tip tree:

   https://www.kernel.org/doc/html/latest/process/maintainer-tip.html#bracket-rules

> +/*
> + * Generates a given positive number of block of ChaCha20 output with nonce=0,
> + * and does not write to any stack or memory outside of the parameters passed
> + * to it. This way, we don't need to worry about stack data leaking into forked
> + * child processes.

Please use proper kernel-doc

> + */
> +static __always_inline void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks)
> +{
> +	extern void chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks);
> +	return chacha20_blocks_nostack(dst_bytes, key, counter, nblocks);

You surely have an issue with your newline key...

The above aside, can you please explain the value of this __arch_()
wrapper?

It's just voodoo with no value because it hands the arguments through
1:1. So where do you expect the __arch...() version of this to be any
different from invoking the architecture-specific version of
chacha20_blocks_nostack()?

Can you spot the irony of your naming choices?

    __arch_chacha20_blocks_nostack() {
      	return chacha20_blocks_nostack()
    };

Thanks,

        tglx
Jason A. Donenfeld Nov. 27, 2022, 10:07 p.m. UTC | #2
Hi Thomas,

On Sat, Nov 26, 2022 at 12:08:41AM +0100, Thomas Gleixner wrote:
> Jason!
> 
> On Thu, Nov 24 2022 at 17:55, Jason A. Donenfeld wrote:
> > +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> > +/*
> > + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> > + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> > + * counter. Importantly does not spill to the stack. Its arguments are:
> 
> Basic or not.

Heh, FYI I didn't mean "basic" here as in "doesn't need a review", but
just that it's a straightforward technique and doesn't do any
complicated multiblock pyrotechnics (which frankly aren't really
needed).

> This needs a Reviewed-by from someone who understands SSE2
> and ChaCha20 before this can go anywhere near the x86 tree.

No problem. I'll see to it that somebody qualified gives this a review.

> > +#include <linux/kernel.h>
> 
> Why do you need kernel.h here?

Turns out I don't, thanks.

> > +static __always_inline ssize_t
> > +getrandom_syscall(void *buffer, size_t len, unsigned int flags)
> 
> static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)
> 
> please. We expanded to 100 quite some time ago.
> 
> Some kernel-doc compliant comment for this would be appreciated as well.

Will do.

> 
> > +{
> > +	long ret;
> > +
> > +	asm ("syscall" : "=a" (ret) :
> > +	     "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
> > +	     "rcx", "r11", "memory");
> > +
> > +	return ret;
> > +}
> > +
> > +#define __vdso_rng_data (VVAR(_vdso_rng_data))
> > +
> > +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
> > +{
> > +	if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
> > +		return (void *)&__vdso_rng_data +
> > +		       ((void *)&__timens_vdso_data - (void *)&__vdso_data);
> > +	return &__vdso_rng_data;
> 
> So either bite the bullet and  write it:
> 
> 	if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
> 		return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data);

Seems fine to me. I'll write it like that.

> > +/*
> > + * Generates a given positive number of block of ChaCha20 output with nonce=0,
> > + * and does not write to any stack or memory outside of the parameters passed
> > + * to it. This way, we don't need to worry about stack data leaking into forked
> > + * child processes.
> 
> Please use proper kernel-doc
> 
> > + */
> > +static __always_inline void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks)
> > +{
> > +	extern void chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks);
> > +	return chacha20_blocks_nostack(dst_bytes, key, counter, nblocks);
> 
> The above aside, can you please explain the value of this __arch_()
> wrapper?
> 
> It's just voodoo for no value because it hands through the arguments
> 1:1. So where are you expecting that that __arch...() version of this is
> any different than invoking the architecture specific version of
> chacha20_blocks_nostack().

I'll just name the assembly function with __arch...(). The idea behind
the wrapper was just to keep all of the non-generic code called from the
generic code prefixed with __arch_, but there's no reason I need to name
it like that from C alone. Will fix for v8.

Thanks again,
Jason
Samuel Neves Nov. 27, 2022, 10:39 p.m. UTC | #3
On Sun, Nov 27, 2022 at 10:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Hi Thomas,
>
> On Sat, Nov 26, 2022 at 12:08:41AM +0100, Thomas Gleixner wrote:
> > Jason!
> >
> > On Thu, Nov 24 2022 at 17:55, Jason A. Donenfeld wrote:
> > > +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> > > +/*
> > > + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> > > + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> > > + * counter. Importantly does not spill to the stack. Its arguments are:
> >
> > Basic or not.
>
> Heh, FYI I didn't mean "basic" here as in "doesn't need a review", but
> just that it's a straightforward technique and doesn't do any
> complicated multiblock pyrotechnics (which frankly aren't really
> needed).
>
> > This needs a Reviewed-by from someone who understands SSE2
> > and ChaCha20 before this can go anywhere near the x86 tree.
>
> No problem. I'll see to it that somebody qualified gives this a review.
>

I did look at this earlier. It looks fine. I would recommend changing

+ /* copy1,copy2 = key */
+ movdqu 0x00(key),copy1
+ movdqu 0x10(key),copy2

to

+ /* copy1,copy2 = key */
+ movups 0x00(key),copy1
+ movups 0x10(key),copy2

which has the same semantics, but saves a couple of code bytes. Likewise for

+ movdqu state0,0x00(output)
+ movdqu state1,0x10(output)
+ movdqu state2,0x20(output)
+ movdqu state3,0x30(output)

Otherwise,

Reviewed-by: Samuel Neves <sneves@dei.uc.pt> # for vgetrandom-chacha.S
Jason A. Donenfeld Nov. 28, 2022, 12:19 a.m. UTC | #4
On Sun, Nov 27, 2022 at 10:39:27PM +0000, Samuel Neves wrote:
> On Sun, Nov 27, 2022 at 10:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> >
> > Hi Thomas,
> >
> > On Sat, Nov 26, 2022 at 12:08:41AM +0100, Thomas Gleixner wrote:
> > > Jason!
> > >
> > > On Thu, Nov 24 2022 at 17:55, Jason A. Donenfeld wrote:
> > > > +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> > > > +/*
> > > > + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> > > > + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> > > > + * counter. Importantly does not spill to the stack. Its arguments are:
> > >
> > > Basic or not.
> >
> > Heh, FYI I didn't mean "basic" here as in "doesn't need a review", but
> > just that it's a straightforward technique and doesn't do any
> > complicated multiblock pyrotechnics (which frankly aren't really
> > needed).
> >
> > > This needs a Reviewed-by from someone who understands SSE2
> > > and ChaCha20 before this can go anywhere near the x86 tree.
> >
> > No problem. I'll see to it that somebody qualified gives this a review.
> >
> 
> I did look at this earlier. It looks fine. I would recommend changing
> 
> + /* copy1,copy2 = key */
> + movdqu 0x00(key),copy1
> + movdqu 0x10(key),copy2
> 
> to
> 
> + /* copy1,copy2 = key */
> + movups 0x00(key),copy1
> + movups 0x10(key),copy2
> 
> which has the same semantics, but saves a couple of code bytes. Likewise for
> 
> + movdqu state0,0x00(output)
> + movdqu state1,0x10(output)
> + movdqu state2,0x20(output)
> + movdqu state3,0x30(output)
> 
> Otherwise,
> 
> Reviewed-by: Samuel Neves <sneves@dei.uc.pt> # for vgetrandom-chacha.S

Thanks for the review and for the suggestion. Will do.

Jason
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 331e21ba961a..b64b1b1274ae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -270,6 +270,7 @@  config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
 	select HAVE_GENERIC_VDSO
+	select HAVE_VDSO_GETRANDOM		if X86_64
 	select HOTPLUG_SMT			if SMP
 	select IRQ_FORCED_THREADING
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 3e88b9df8c8f..2de64e52236a 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -27,7 +27,7 @@  VDSO32-$(CONFIG_X86_32)		:= y
 VDSO32-$(CONFIG_IA32_EMULATION)	:= y
 
 # files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
 vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
 vobjs32-y += vdso32/vclock_gettime.o
 vobjs-$(CONFIG_X86_SGX)	+= vsgx.o
@@ -104,6 +104,7 @@  CFLAGS_REMOVE_vclock_gettime.o = -pg
 CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
 CFLAGS_REMOVE_vgetcpu.o = -pg
 CFLAGS_REMOVE_vsgx.o = -pg
+CFLAGS_REMOVE_vgetrandom.o = -pg
 
 #
 # X32 processes use x32 vDSO to access 64bit kernel data.
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 4bf48462fca7..1919cc39277e 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -28,6 +28,8 @@  VERSION {
 		clock_getres;
 		__vdso_clock_getres;
 		__vdso_sgx_enter_enclave;
+		getrandom;
+		__vdso_getrandom;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..d1b986be3aa4
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
@@ -0,0 +1,179 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section	.rodata.cst16.CONSTANTS, "aM", @progbits, 16
+.align 16
+CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
+.text
+
+/*
+ * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
+ * of blocks of output with a nonce of 0, taking an input key and 8-byte
+ * counter. Importantly does not spill to the stack. Its arguments are:
+ *
+ *	rdi: output bytes
+ *	rsi: 32-byte key input
+ *	rdx: 8-byte counter input/output
+ *	rcx: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(chacha20_blocks_nostack)
+
+#define output  %rdi
+#define key     %rsi
+#define counter %rdx
+#define nblocks %rcx
+#define i       %al
+#define state0  %xmm0
+#define state1  %xmm1
+#define state2  %xmm2
+#define state3  %xmm3
+#define copy0   %xmm4
+#define copy1   %xmm5
+#define copy2   %xmm6
+#define copy3   %xmm7
+#define temp    %xmm8
+#define one     %xmm9
+
+	/* copy0 = "expand 32-byte k" */
+	movaps		CONSTANTS(%rip),copy0
+	/* copy1,copy2 = key */
+	movdqu		0x00(key),copy1
+	movdqu		0x10(key),copy2
+	/* copy3 = counter || zero nonce */
+	movq		0x00(counter),copy3
+	/* one = 1 || 0 */
+	movq		$1,%rax
+	movq		%rax,one
+
+.Lblock:
+	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
+	movdqa		copy0,state0
+	movdqa		copy1,state1
+	movdqa		copy2,state2
+	movdqa		copy3,state3
+
+	movb		$10,i
+.Lpermute:
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+	paddd		state1,state0
+	pxor		state0,state3
+	movdqa		state3,temp
+	pslld		$16,temp
+	psrld		$16,state3
+	por		temp,state3
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+	paddd		state3,state2
+	pxor		state2,state1
+	movdqa		state1,temp
+	pslld		$12,temp
+	psrld		$20,state1
+	por		temp,state1
+
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+	paddd		state1,state0
+	pxor		state0,state3
+	movdqa		state3,temp
+	pslld		$8,temp
+	psrld		$24,state3
+	por		temp,state3
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+	paddd		state3,state2
+	pxor		state2,state1
+	movdqa		state1,temp
+	pslld		$7,temp
+	psrld		$25,state1
+	por		temp,state1
+
+	/* state1 = shuffle32(state1, MASK(0, 3, 2, 1)) */
+	pshufd		$0x39,state1,state1
+	/* state2 = shuffle32(state2, MASK(1, 0, 3, 2)) */
+	pshufd		$0x4e,state2,state2
+	/* state3 = shuffle32(state3, MASK(2, 1, 0, 3)) */
+	pshufd		$0x93,state3,state3
+
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+	paddd		state1,state0
+	pxor		state0,state3
+	movdqa		state3,temp
+	pslld		$16,temp
+	psrld		$16,state3
+	por		temp,state3
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+	paddd		state3,state2
+	pxor		state2,state1
+	movdqa		state1,temp
+	pslld		$12,temp
+	psrld		$20,state1
+	por		temp,state1
+
+	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+	paddd		state1,state0
+	pxor		state0,state3
+	movdqa		state3,temp
+	pslld		$8,temp
+	psrld		$24,state3
+	por		temp,state3
+
+	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+	paddd		state3,state2
+	pxor		state2,state1
+	movdqa		state1,temp
+	pslld		$7,temp
+	psrld		$25,state1
+	por		temp,state1
+
+	/* state1 = shuffle32(state1, MASK(2, 1, 0, 3)) */
+	pshufd		$0x93,state1,state1
+	/* state2 = shuffle32(state2, MASK(1, 0, 3, 2)) */
+	pshufd		$0x4e,state2,state2
+	/* state3 = shuffle32(state3, MASK(0, 3, 2, 1)) */
+	pshufd		$0x39,state3,state3
+
+	decb		i
+	jnz		.Lpermute
+
+	/* output0 = state0 + copy0 */
+	paddd		copy0,state0
+	movdqu		state0,0x00(output)
+	/* output1 = state1 + copy1 */
+	paddd		copy1,state1
+	movdqu		state1,0x10(output)
+	/* output2 = state2 + copy2 */
+	paddd		copy2,state2
+	movdqu		state2,0x20(output)
+	/* output3 = state3 + copy3 */
+	paddd		copy3,state3
+	movdqu		state3,0x30(output)
+
+	/* ++copy3.counter */
+	paddq		one,copy3
+
+	/* output += 64, --nblocks */
+	addq		$64,output
+	decq		nblocks
+	jnz		.Lblock
+
+	/* counter = copy3.counter */
+	movq		copy3,0x00(counter)
+
+	/* Zero out all the regs, in case nothing uses these again. */
+	pxor		state0,state0
+	pxor		state1,state1
+	pxor		state2,state2
+	pxor		state3,state3
+	pxor		copy0,copy0
+	pxor		copy1,copy1
+	pxor		copy2,copy2
+	pxor		copy3,copy3
+	pxor		temp,temp
+
+	ret
+SYM_FUNC_END(chacha20_blocks_nostack)
diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c
new file mode 100644
index 000000000000..c7a2476d5d8a
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom.c
@@ -0,0 +1,18 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "../../../../lib/vdso/getrandom.c"
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state);
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state)
+{
+	return __cvdso_getrandom(buffer, len, flags, state);
+}
+
+ssize_t getrandom(void *, size_t, unsigned int, void *)
+	__attribute__((weak, alias("__vdso_getrandom")));
diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..099aca58ef20
--- /dev/null
+++ b/arch/x86/include/asm/vdso/getrandom.h
@@ -0,0 +1,49 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/unistd.h>
+#include <asm/vvar.h>
+
+static __always_inline ssize_t
+getrandom_syscall(void *buffer, size_t len, unsigned int flags)
+{
+	long ret;
+
+	asm ("syscall" : "=a" (ret) :
+	     "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
+	     "rcx", "r11", "memory");
+
+	return ret;
+}
+
+#define __vdso_rng_data (VVAR(_vdso_rng_data))
+
+static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
+{
+	if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
+		return (void *)&__vdso_rng_data +
+		       ((void *)&__timens_vdso_data - (void *)&__vdso_data);
+	return &__vdso_rng_data;
+}
+
+/*
+ * Generates a given positive number of blocks of ChaCha20 output with nonce=0,
+ * and does not write to any stack or memory outside of the parameters passed
+ * to it. This way, we don't need to worry about stack data leaking into forked
+ * child processes.
+ */
+static __always_inline void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks)
+{
+	extern void chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks);
+	return chacha20_blocks_nostack(dst_bytes, key, counter, nblocks);
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
index be199a9b2676..71c56586a22f 100644
--- a/arch/x86/include/asm/vdso/vsyscall.h
+++ b/arch/x86/include/asm/vdso/vsyscall.h
@@ -11,6 +11,8 @@ 
 #include <asm/vvar.h>
 
 DEFINE_VVAR(struct vdso_data, _vdso_data);
+DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data);
+
 /*
  * Update the vDSO data page to keep in sync with kernel timekeeping.
  */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 183e98e49ab9..9d9af37f7cab 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -26,6 +26,8 @@ 
  */
 #define DECLARE_VVAR(offset, type, name) \
 	EMIT_VVAR(name, offset)
+#define DECLARE_VVAR_SINGLE(offset, type, name) \
+	EMIT_VVAR(name, offset)
 
 #else
 
@@ -37,6 +39,10 @@  extern char __vvar_page;
 	extern type timens_ ## name[CS_BASES]				\
 	__attribute__((visibility("hidden")));				\
 
+#define DECLARE_VVAR_SINGLE(offset, type, name)				\
+	extern type vvar_ ## name					\
+	__attribute__((visibility("hidden")));				\
+
 #define VVAR(name) (vvar_ ## name)
 #define TIMENS(name) (timens_ ## name)
 
@@ -44,12 +50,22 @@  extern char __vvar_page;
 	type name[CS_BASES]						\
 	__attribute__((section(".vvar_" #name), aligned(16))) __visible
 
+#define DEFINE_VVAR_SINGLE(type, name)					\
+	type name							\
+	__attribute__((section(".vvar_" #name), aligned(16))) __visible
+
 #endif
 
 /* DECLARE_VVAR(offset, type, name) */
 
 DECLARE_VVAR(128, struct vdso_data, _vdso_data)
 
+#if !defined(_SINGLE_DATA)
+#define _SINGLE_DATA
+DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data)
+#endif
+
 #undef DECLARE_VVAR
+#undef DECLARE_VVAR_SINGLE
 
 #endif