
[RFT,v4,2/5] fork: Add shadow stack support to clone3()

Message ID: 20231128-clone3-shadow-stack-v4-2-8b28ffe4f676@kernel.org
State: New
Series: fork: Support shadow stacks in clone3()

Commit Message

Mark Brown Nov. 28, 2023, 6:22 p.m. UTC
Unlike the normal stack, there is no API for configuring the shadow
stack for a new thread; instead the kernel dynamically allocates a new
shadow stack with the same size as the normal stack. This appears to be due
to the shadow stack series having been in development since before the more
extensible clone3() was added, rather than anything more deliberate.

Add a parameter to clone3() specifying the size of a shadow stack for
the newly created process.  If no shadow stack is specified then the
existing implicit allocation behaviour is maintained.

If the architecture does not support shadow stacks, the shadow stack size
parameter must be zero; architectures that do support the feature are
expected to enforce the same requirement on individual systems that lack
shadow stack support.

Update the existing x86 implementation to pay attention to the newly added
arguments; in order to maintain compatibility we use the existing behaviour
if no shadow stack is specified. Minimal validation is done of the supplied
parameters; detailed enforcement is left to when the thread is executed.
Since we are now using more fields from the kernel_clone_args we pass that
into the shadow stack code rather than individual fields.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/x86/include/asm/shstk.h | 11 +++++----
 arch/x86/kernel/process.c    |  2 +-
 arch/x86/kernel/shstk.c      | 56 ++++++++++++++++++++++++++++++--------------
 include/linux/sched/task.h   |  1 +
 include/uapi/linux/sched.h   |  4 ++++
 kernel/fork.c                | 53 +++++++++++++++++++++++++++++++----------
 6 files changed, 92 insertions(+), 35 deletions(-)
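
[Editor's note: for context, a sketch of how a userspace caller might
exercise the new argument. This assumes a kernel with this series
applied; the struct is declared locally (mirroring the uapi change in
the patch) since released headers predate shadow_stack_size, and
clone3_with_shstk() is a hypothetical helper name.]

#define _GNU_SOURCE
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local mirror of the extended clone_args (CLONE_ARGS_SIZE_VER3 == 96). */
struct clone_args_v3 {
	__aligned_u64 flags;
	__aligned_u64 pidfd;
	__aligned_u64 child_tid;
	__aligned_u64 parent_tid;
	__aligned_u64 exit_signal;
	__aligned_u64 stack;
	__aligned_u64 stack_size;
	__aligned_u64 tls;
	__aligned_u64 set_tid;
	__aligned_u64 set_tid_size;
	__aligned_u64 cgroup;
	__aligned_u64 shadow_stack_size;	/* new in this series */
};

static long clone3_with_shstk(uint64_t flags, void *stack,
			      size_t stack_size, size_t shstk_size)
{
	struct clone_args_v3 args = {
		.flags			= flags,	/* e.g. CLONE_VM for a thread-like child */
		.exit_signal		= SIGCHLD,
		.stack			= (uintptr_t)stack,
		.stack_size		= stack_size,
		.shadow_stack_size	= shstk_size,	/* 0 keeps the implicit sizing */
	};

	/* Returns the child's pid in the parent and 0 in the child. */
	return syscall(__NR_clone3, &args, sizeof(args));
}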

Comments

Deepak Gupta Nov. 28, 2023, 9:23 p.m. UTC | #1
On Tue, Nov 28, 2023 at 06:22:40PM +0000, Mark Brown wrote:
> [...]
>diff --git a/kernel/fork.c b/kernel/fork.c
>index 10917c3e1f03..35131acd43d2 100644
>--- a/kernel/fork.c
>+++ b/kernel/fork.c
>@@ -121,6 +121,11 @@
>  */
> #define MAX_THREADS FUTEX_TID_MASK
>
>+/*
>+ * Require that shadow stacks can store at least one element
>+ */
>+#define SHADOW_STACK_SIZE_MIN 8

Nit: sorry, should've mentioned it earlier.
Can this be "#define SHADOW_STACK_SIZE_MIN sizeof(unsigned long)"?


Mark Brown Nov. 29, 2023, 1:05 p.m. UTC | #2
On Tue, Nov 28, 2023 at 01:23:57PM -0800, Deepak Gupta wrote:
> On Tue, Nov 28, 2023 at 06:22:40PM +0000, Mark Brown wrote:

> > +#define SHADOW_STACK_SIZE_MIN 8

> Nit: sorry, should've mentioned it earlier.
> Can this be "#define SHADOW_STACK_SIZE_MIN sizeof(unsigned long)"?

Given that the stack is full of pointers, I'll go for void * instead.
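
[Editor's note: for concreteness, the agreed revision would presumably
end up as the following — a sketch of the follow-up change, not code
from this posting.]

/*
 * Require that shadow stacks can store at least one element,
 * i.e. one stack slot rather than a magic number.
 */
#define SHADOW_STACK_SIZE_MIN sizeof(void *)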
Edgecombe, Rick P Dec. 5, 2023, 12:26 a.m. UTC | #3
On Tue, 2023-11-28 at 18:22 +0000, Mark Brown wrote:
> -unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
> -                                       unsigned long stack_size)
> +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
> +                                       const struct kernel_clone_args *args)
>  {
>         struct thread_shstk *shstk = &tsk->thread.shstk;
> +       unsigned long clone_flags = args->flags;
>         unsigned long addr, size;
> 
>         /*
>          * If shadow stack is not enabled on the new thread, skip any
> -        * switch to a new shadow stack.
> +        * implicit switch to a new shadow stack and reject attempts to
> +        * explicitly specify one.
>          */
> -       if (!features_enabled(ARCH_SHSTK_SHSTK))
> -               return 0;
> +       if (!features_enabled(ARCH_SHSTK_SHSTK)) {
> +               if (args->shadow_stack_size)
> +                       return (unsigned long)ERR_PTR(-EINVAL);
> 
> -       /*
> -        * For CLONE_VFORK the child will share the parents shadow stack.
> -        * Make sure to clear the internal tracking of the thread shadow
> -        * stack so the freeing logic run for child knows to leave it alone.
> -        */
> -       if (clone_flags & CLONE_VFORK) {
> -               shstk->base = 0;
> -               shstk->size = 0;
>                 return 0;
>         }
> 
>         /*
> -        * For !CLONE_VM the child will use a copy of the parents shadow
> -        * stack.
> +        * If the user specified a shadow stack then do some basic
> +        * validation and use it, otherwise fall back to a default
> +        * shadow stack size if the clone_flags don't indicate an
> +        * allocation is unneeded.
>          */
> -       if (!(clone_flags & CLONE_VM))
> -               return 0;
> +       if (args->shadow_stack_size) {
> +               size = args->shadow_stack_size;
> +       } else {
> +               /*
> +                * For CLONE_VFORK the child will share the parents
> +                * shadow stack.  Make sure to clear the internal
> +                * tracking of the thread shadow stack so the freeing
> +                * logic run for child knows to leave it alone.
> +                */
> +               if (clone_flags & CLONE_VFORK) {
> +                       shstk->base = 0;
> +                       shstk->size = 0;
> +                       return 0;
> +               }
> +
> +               /*
> +                * For !CLONE_VM the child will use a copy of the
> +                * parents shadow stack.
> +                */
> +               if (!(clone_flags & CLONE_VM))
> +                       return 0;
> +
> +               size = args->stack_size;
> +
> +       }
> 
> -       size = adjust_shstk_size(stack_size);
> +       size = adjust_shstk_size(size);
>         addr = alloc_shstk(0, size, 0, false);

Hmm. I didn't test this, but in copy_process(), copy_mm() happens
before this point. So the shadow stack would get mapped in current's MM
(i.e. the parent). So in the !CLONE_VM case with shadow_stack_size!=0
the SSP in the child will be updated to an area that is not mapped in
the child. I think we need to pass tsk->mm into alloc_shstk(). But such
an exotic clone usage does give me pause, regarding whether all of this
is premature.

Otherwise it looked ok from the x86/shstk perspective.

>         if (IS_ERR_VALUE(addr))
>                 return addr;
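
[Editor's note: to make the ordering concrete, a simplified sketch of
the relevant steps in copy_process() (kernel/fork.c); error handling
and the many intervening steps are omitted, and the function name is
illustrative, not actual kernel code.]

static int copy_process_ordering_sketch(struct task_struct *p,
					struct kernel_clone_args *args)
{
	int retval;

	/* For !CLONE_VM children, dup_mm() duplicates the parent's mm here. */
	retval = copy_mm(args->flags, p);
	if (retval)
		return retval;

	/*
	 * Much later: on x86 this reaches shstk_alloc_thread_stack(),
	 * whose alloc_shstk() call maps the new shadow stack into
	 * current->mm, i.e. the parent's, not p->mm.
	 */
	return copy_thread(p, args);
}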
Mark Brown Dec. 5, 2023, 3:51 p.m. UTC | #4
On Tue, Dec 05, 2023 at 12:26:57AM +0000, Edgecombe, Rick P wrote:
> On Tue, 2023-11-28 at 18:22 +0000, Mark Brown wrote:

> > -       size = adjust_shstk_size(stack_size);
> > +       size = adjust_shstk_size(size);
> >         addr = alloc_shstk(0, size, 0, false);

> Hmm. I didn't test this, but in the copy_process(), copy_mm() happens
> before this point. So the shadow stack would get mapped in current's MM
> (i.e. the parent). So in the !CLONE_VM case with shadow_stack_size!=0
> the SSP in the child will be updated to an area that is not mapped in
> the child. I think we need to pass tsk->mm into alloc_shstk(). But such
> an exotic clone usage does give me pause, regarding whether all of this
> is premature.

Hrm, right.  And we then can't use do_mmap() either.  I'd be somewhat
tempted to disallow that specific case for now rather than deal with it,
though that's not really in the spirit of just always following what the
user asked for.
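
[Editor's note: for illustration, a minimal sketch of what that could
look like as a variant of this patch's clone3_shadow_stack_valid() (see
the full patch below); the !CLONE_VM check is the hypothetical
addition, not something posted in this series.]

/*
 * Hypothetical variant: reject an explicit shadow stack size for
 * !CLONE_VM children until a path exists to map the new stack into
 * the child's duplicated mm.
 */
static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
{
	if (!kargs->shadow_stack_size)
		return true;

	/* No way (yet) to place the allocation in the child's mm. */
	if (!(kargs->flags & CLONE_VM))
		return false;

	if (kargs->shadow_stack_size < SHADOW_STACK_SIZE_MIN)
		return false;

	if (kargs->shadow_stack_size > rlimit(RLIMIT_STACK))
		return false;

	/* The architecture must check support on the specific machine */
	return IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK);
}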
Edgecombe, Rick P Dec. 5, 2023, 10:23 p.m. UTC | #5
On Tue, 2023-12-05 at 15:51 +0000, Mark Brown wrote:
> On Tue, Dec 05, 2023 at 12:26:57AM +0000, Edgecombe, Rick P wrote:
> > On Tue, 2023-11-28 at 18:22 +0000, Mark Brown wrote:
> 
> > > -       size = adjust_shstk_size(stack_size);
> > > +       size = adjust_shstk_size(size);
> > >         addr = alloc_shstk(0, size, 0, false);
> 
> > Hmm. I didn't test this, but in copy_process(), copy_mm() happens
> > before this point. So the shadow stack would get mapped in current's
> > MM (i.e. the parent). So in the !CLONE_VM case with
> > shadow_stack_size!=0 the SSP in the child will be updated to an area
> > that is not mapped in the child. I think we need to pass tsk->mm into
> > alloc_shstk(). But such an exotic clone usage does give me pause,
> > regarding whether all of this is premature.
> 
> Hrm, right.  And we then can't use do_mmap() either.  I'd be somewhat
> tempted to disallow that specific case for now rather than deal with
> it, though that's not really in the spirit of just always following
> what the user asked for.

Oh, yea. What a pain. It doesn't seem like we could easily even add a
do_mmap() variant that takes an mm either.

I did a quick logging test on a Fedora userspace. systemd (I think)
appears to do a clone(!CLONE_VM) with a stack passed. So maybe the
combo might actually get used with a shadow_stack_size if it used
clone3 some day. At the same time, fixing clone to mmap() in the child
doesn't seem straightforward at all. Checking with some of our MM
folks, the suggestion was to look at doing the child's shadow stack
mapping in dup_mm() to avoid tripping over complications that happen
when a remote MM becomes more "live".
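
[Editor's note: the pattern observed in that logging test looks roughly
like this from userspace — an illustrative sketch; with this series,
the clone3() equivalent of this call plus a non-zero shadow_stack_size
would be exactly the problematic combination.]

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SZ (1024 * 1024)

static int child_fn(void *arg)
{
	_exit(0);	/* child runs with a copy of the parent's mm */
}

int main(void)
{
	char *stack = malloc(STACK_SZ);
	pid_t pid;

	if (!stack)
		return 1;

	/* No CLONE_VM, but an explicitly passed stack (stack grows down). */
	pid = clone(child_fn, stack + STACK_SZ, SIGCHLD, NULL);
	if (pid < 0)
		return 1;

	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}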

If we just punt on this combination for now, then the documented rules
for args->shadow_stack_size would be something like:
clone3 will use the parent's shadow stack when CLONE_VM is not present.
If CLONE_VFORK is set then it will use the parent's shadow stack only
when args->shadow_stack_size is zero. In the cases when the parent's
shadow stack is not used, args->shadow_stack_size is used for the size
whenever non-zero.
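
[Editor's note: those rules restated as a matrix; "sz" is
args->shadow_stack_size and shadow stack is assumed enabled for the new
thread. An editorial summary, not code from the patch.]

/*
 *   !CLONE_VM,   sz != 0  ->  rejected (the punted combination)
 *   !CLONE_VM,   sz == 0  ->  child gets a copy of the parent's stack
 *   CLONE_VFORK, sz == 0  ->  child shares the parent's stack
 *   CLONE_VFORK, sz != 0  ->  allocate sz bytes
 *   CLONE_VM,    sz == 0  ->  allocate args->stack_size bytes
 *   CLONE_VM,    sz != 0  ->  allocate sz bytes
 */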

I guess it doesn't seem too overly complicated. But I don't think
any of the options seem great. I'd unhappily lean towards not
supporting shadow_stack_size!=0 && !CLONE_VM for now. But it seems like
there may be a user for the unsupported case, so this would be just
improving things a little and kicking the can down the road. I also
wonder if this is a sign to reconsider the earlier token-consuming
design.
Mark Brown Dec. 6, 2023, 6:24 p.m. UTC | #6
On Tue, Dec 05, 2023 at 10:23:08PM +0000, Edgecombe, Rick P wrote:
> On Tue, 2023-12-05 at 15:51 +0000, Mark Brown wrote:

> > Hrm, right.  And we then can't use do_mmap() either.  I'd be somewhat
> > tempted to disallow that specific case for now rather than deal with
> > it, though that's not really in the spirit of just always following
> > what the user asked for.

> Oh, yea. What a pain. It doesn't seem like we could easily even add a
> do_mmap() variant that takes an mm either.

> I did a quick logging test on a Fedora userspace. systemd (I think)
> appears to do a clone(!CLONE_VM) with a stack passed. So maybe the
> combo might actually get used with a shadow_stack_size if it used
> clone3 some day. At the same time, fixing clone to mmap() in the child
> doesn't seem straightforward at all. Checking with some of our MM
> folks, the suggestion was to look at doing the child's shadow stack
> mapping in dup_mm() to avoid tripping over complications that happen
> when a remote MM becomes more "live".

Yeah, I can't see anything that looks particularly tasteful.

> If we just punt on this combination for now, then the documented rules
> for args->shadow_stack_size would be something like:
> clone3 will use the parent's shadow stack when CLONE_VM is not present.
> If CLONE_VFORK is set then it will use the parent's shadow stack only
> when args->shadow_stack_size is zero. In the cases when the parent's
> shadow stack is not used, args->shadow_stack_size is used for the size
> whenever non-zero.

> I guess it doesn't seem too overly complicated. But I don't think
> any of the options seem great. I'd unhappily lean towards not

Indeed, it's all really hard to get enthusiastic about.

> supporting shadow_stack_size!=0 && !CLONE_VM for now. But it seems like
> there may be a user for the unsupported case, so this would be just
> improving things a little and kicking the can down the road. I also
> wonder if this is a sign to reconsider the earlier token-consuming
> design.

In the case where we have !CLONE_VM it should actually be possible to
reuse the token (since the user is in at least some sense the child process
rather than the parent) so it's less pure overhead, providing you don't
mind the children of a given parent all using the same addresses for
their initial shadow stack.

I'll have a poke at the various options and come up with something,
hopefully this month, but it's getting a bit busy so it might be early
next year instead.

Patch

diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 42fee8959df7..8be7b0a909c3 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -6,6 +6,7 @@ 
 #include <linux/types.h>
 
 struct task_struct;
+struct kernel_clone_args;
 struct ksignal;
 
 #ifdef CONFIG_X86_USER_SHADOW_STACK
@@ -16,8 +17,8 @@  struct thread_shstk {
 
 long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
 void reset_thread_features(void);
-unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
-				       unsigned long stack_size);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+				       const struct kernel_clone_args *args);
 void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
@@ -26,8 +27,10 @@  static inline long shstk_prctl(struct task_struct *task, int option,
 			       unsigned long arg2) { return -EINVAL; }
 static inline void reset_thread_features(void) {}
 static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
-						     unsigned long clone_flags,
-						     unsigned long stack_size) { return 0; }
+						     const struct kernel_clone_args *args)
+{
+	return 0;
+}
 static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6f4e8399fca..a9ca80ea5056 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -207,7 +207,7 @@  int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	 * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
 	 * update it.
 	 */
-	new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+	new_ssp = shstk_alloc_thread_stack(p, args);
 	if (IS_ERR_VALUE(new_ssp))
 		return PTR_ERR((void *)new_ssp);
 
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 59e15dd8d0f8..0d1325d2d94a 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -191,38 +191,58 @@  void reset_thread_features(void)
 	current->thread.features_locked = 0;
 }
 
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
-				       unsigned long stack_size)
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
+				       const struct kernel_clone_args *args)
 {
 	struct thread_shstk *shstk = &tsk->thread.shstk;
+	unsigned long clone_flags = args->flags;
 	unsigned long addr, size;
 
 	/*
 	 * If shadow stack is not enabled on the new thread, skip any
-	 * switch to a new shadow stack.
+	 * implicit switch to a new shadow stack and reject attempts to
+	 * explicitly specify one.
 	 */
-	if (!features_enabled(ARCH_SHSTK_SHSTK))
-		return 0;
+	if (!features_enabled(ARCH_SHSTK_SHSTK)) {
+		if (args->shadow_stack_size)
+			return (unsigned long)ERR_PTR(-EINVAL);
 
-	/*
-	 * For CLONE_VFORK the child will share the parents shadow stack.
-	 * Make sure to clear the internal tracking of the thread shadow
-	 * stack so the freeing logic run for child knows to leave it alone.
-	 */
-	if (clone_flags & CLONE_VFORK) {
-		shstk->base = 0;
-		shstk->size = 0;
 		return 0;
 	}
 
 	/*
-	 * For !CLONE_VM the child will use a copy of the parents shadow
-	 * stack.
+	 * If the user specified a shadow stack then do some basic
+	 * validation and use it, otherwise fall back to a default
+	 * shadow stack size if the clone_flags don't indicate an
+	 * allocation is unneeded.
 	 */
-	if (!(clone_flags & CLONE_VM))
-		return 0;
+	if (args->shadow_stack_size) {
+		size = args->shadow_stack_size;
+	} else {
+		/*
+		 * For CLONE_VFORK the child will share the parents
+		 * shadow stack.  Make sure to clear the internal
+		 * tracking of the thread shadow stack so the freeing
+		 * logic run for child knows to leave it alone.
+		 */
+		if (clone_flags & CLONE_VFORK) {
+			shstk->base = 0;
+			shstk->size = 0;
+			return 0;
+		}
+
+		/*
+		 * For !CLONE_VM the child will use a copy of the
+		 * parents shadow stack.
+		 */
+		if (!(clone_flags & CLONE_VM))
+			return 0;
+
+		size = args->stack_size;
+
+	}
 
-	size = adjust_shstk_size(stack_size);
+	size = adjust_shstk_size(size);
 	addr = alloc_shstk(0, size, 0, false);
 	if (IS_ERR_VALUE(addr))
 		return addr;
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..e86a09cfccd8 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -41,6 +41,7 @@  struct kernel_clone_args {
 	void *fn_arg;
 	struct cgroup *cgrp;
 	struct css_set *cset;
+	unsigned long shadow_stack_size;
 };
 
 /*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..a998b6d0c897 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -84,6 +84,8 @@ 
  *                kernel's limit of nested PID namespaces.
  * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
  *                a file descriptor for the cgroup.
+ * @shadow_stack_size: Specify the size of the shadow stack to allocate
+ *                     for the child process.
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
@@ -101,12 +103,14 @@  struct clone_args {
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
 	__aligned_u64 cgroup;
+	__aligned_u64 shadow_stack_size;
 };
 #endif
 
 #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
 #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
 #define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#define CLONE_ARGS_SIZE_VER3 96 /* sizeof fourth published struct */
 
 /*
  * Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..35131acd43d2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -121,6 +121,11 @@ 
  */
 #define MAX_THREADS FUTEX_TID_MASK
 
+/*
+ * Require that shadow stacks can store at least one element
+ */
+#define SHADOW_STACK_SIZE_MIN 8
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -3067,7 +3072,9 @@  noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		     CLONE_ARGS_SIZE_VER1);
 	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
 		     CLONE_ARGS_SIZE_VER2);
-	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+	BUILD_BUG_ON(offsetofend(struct clone_args, shadow_stack_size) !=
+		     CLONE_ARGS_SIZE_VER3);
+	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER3);
 
 	if (unlikely(usize > PAGE_SIZE))
 		return -E2BIG;
@@ -3100,16 +3107,17 @@  noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		return -EINVAL;
 
 	*kargs = (struct kernel_clone_args){
-		.flags		= args.flags,
-		.pidfd		= u64_to_user_ptr(args.pidfd),
-		.child_tid	= u64_to_user_ptr(args.child_tid),
-		.parent_tid	= u64_to_user_ptr(args.parent_tid),
-		.exit_signal	= args.exit_signal,
-		.stack		= args.stack,
-		.stack_size	= args.stack_size,
-		.tls		= args.tls,
-		.set_tid_size	= args.set_tid_size,
-		.cgroup		= args.cgroup,
+		.flags			= args.flags,
+		.pidfd			= u64_to_user_ptr(args.pidfd),
+		.child_tid		= u64_to_user_ptr(args.child_tid),
+		.parent_tid		= u64_to_user_ptr(args.parent_tid),
+		.exit_signal		= args.exit_signal,
+		.stack			= args.stack,
+		.stack_size		= args.stack_size,
+		.tls			= args.tls,
+		.set_tid_size		= args.set_tid_size,
+		.cgroup			= args.cgroup,
+		.shadow_stack_size	= args.shadow_stack_size,
 	};
 
 	if (args.set_tid &&
@@ -3150,6 +3158,27 @@  static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 	return true;
 }
 
+/**
+ * clone3_shadow_stack_valid - check and prepare shadow stack
+ * @kargs: kernel clone args
+ *
+ * Verify that shadow stacks are only enabled if supported.
+ */
+static inline bool clone3_shadow_stack_valid(struct kernel_clone_args *kargs)
+{
+	if (!kargs->shadow_stack_size)
+		return true;
+
+	if (kargs->shadow_stack_size < SHADOW_STACK_SIZE_MIN)
+		return false;
+
+	if (kargs->shadow_stack_size > rlimit(RLIMIT_STACK))
+		return false;
+
+	/* The architecture must check support on the specific machine */
+	return IS_ENABLED(CONFIG_ARCH_HAS_USER_SHADOW_STACK);
+}
+
 static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
@@ -3172,7 +3201,7 @@  static bool clone3_args_valid(struct kernel_clone_args *kargs)
 	    kargs->exit_signal)
 		return false;
 
-	if (!clone3_stack_valid(kargs))
+	if (!clone3_stack_valid(kargs) || !clone3_shadow_stack_valid(kargs))
 		return false;
 
 	return true;