diff mbox series

[v6,1/9] kernel: Support TIF_SYSCALL_INTERCEPT flag

Message ID 20200904203147.2908430-2-krisman@collabora.com (mailing list archive)
State New
Headers show
Series Syscall User Dispatch | expand

Commit Message

Gabriel Krisman Bertazi Sept. 4, 2020, 8:31 p.m. UTC
Convert TIF_SECCOMP into a generic TI flag for any syscall interception
work being done by the kernel.  The actual type of work is exposed by a
new flag field outside of thread_info.  This ensures that the
syscall_intercept field is only accessed if struct seccomp has to be
accessed already, such that it doesn't incur in a much higher cost to
the seccomp path.

In order to avoid modifying every architecture at once, this patch has a
transition mechanism, such that architectures that define TIF_SECCOMP
continue to work by ignoring the syscall_intercept flag, as long as they
don't support other syscall interception mechanisms like the future
syscall user dispatch.  When migrating TIF_SECCOMP to
TIF_SYSCALL_INTERCEPT, they should adopt the semantics of checking the
syscall_intercept flag, like it is done in the common entry syscall
code, or even better, migrate to the common syscall entry code.

This was tested by running the selftests for seccomp.  No regressions
were observed, and all tests passed (with and without this patch).

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 include/linux/sched.h             |  6 ++-
 include/linux/seccomp.h           | 20 ++++++++-
 include/linux/syscall_intercept.h | 70 +++++++++++++++++++++++++++++++
 kernel/fork.c                     | 10 ++++-
 kernel/seccomp.c                  |  7 ++--
 5 files changed, 106 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/syscall_intercept.h

Comments

Christian Brauner Sept. 7, 2020, 10:16 a.m. UTC | #1
On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
> Convert TIF_SECCOMP into a generic TI flag for any syscall interception
> work being done by the kernel.  The actual type of work is exposed by a
> new flag field outside of thread_info.  This ensures that the
> syscall_intercept field is only accessed if struct seccomp has to be
> accessed already, such that it doesn't incur in a much higher cost to
> the seccomp path.
> 
> In order to avoid modifying every architecture at once, this patch has a
> transition mechanism, such that architectures that define TIF_SECCOMP
> continue to work by ignoring the syscall_intercept flag, as long as they
> don't support other syscall interception mechanisms like the future
> syscall user dispatch.  When migrating TIF_SECCOMP to
> TIF_SYSCALL_INTERCEPT, they should adopt the semantics of checking the
> syscall_intercept flag, like it is done in the common entry syscall
> code, or even better, migrate to the common syscall entry code.
> 
> This was tested by running the selftests for seccomp.  No regressions
> were observed, and all tests passed (with and without this patch).
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> ---
>  include/linux/sched.h             |  6 ++-
>  include/linux/seccomp.h           | 20 ++++++++-
>  include/linux/syscall_intercept.h | 70 +++++++++++++++++++++++++++++++
>  kernel/fork.c                     | 10 ++++-
>  kernel/seccomp.c                  |  7 ++--
>  5 files changed, 106 insertions(+), 7 deletions(-)
>  create mode 100644 include/linux/syscall_intercept.h
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index afe01e232935..3511c98a7849 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -959,7 +959,11 @@ struct task_struct {
>  	kuid_t				loginuid;
>  	unsigned int			sessionid;
>  #endif
> -	struct seccomp			seccomp;
> +
> +	struct {
> +		unsigned int			syscall_intercept;
> +		struct seccomp			seccomp;
> +	};

If there's no specific reason to do this I'd not wrap this in an
anonymous struct. It doesn't really buy anything and there doesn't seem
to be  precedent in struct task_struct right now. Also, if this somehow
adds padding it seems you might end up increasing the size of struct
task_struct more than necessary by accident? (I might be wrong though.)

>  
>  	/* Thread group tracking: */
>  	u64				parent_exec_id;
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 02aef2844c38..027dc462cea9 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -20,6 +20,24 @@
>  #include <linux/atomic.h>
>  #include <asm/seccomp.h>
>  
> +/*
> + * Some transitional defines to avoid migrating every architecture code
> + * at once.
> + */
> +
> +#if defined(TIF_SECCOMP) && defined(TIF_SYSCALL_INTERCEPT)
> +# error "TIF_SYSCALL_INTERCEPT and TIF_SECCOMP can't be defined at the same time"
> +#endif
> +
> +/*
> + * If the arch has not transitioned to TIF_SYSCALL_INTERCEPT, this let
> + * seccomp work with these architectures, as long as no other syscall
> + * intercept features are meant to be supported.
> + */
> +#ifdef TIF_SECCOMP
> +# define TIF_SYSCALL_INTERCEPT TIF_SECCOMP
> +#endif
> +
>  struct seccomp_filter;
>  /**
>   * struct seccomp - the state of a seccomp'ed process
> @@ -42,7 +60,7 @@ struct seccomp {
>  extern int __secure_computing(const struct seccomp_data *sd);
>  static inline int secure_computing(void)
>  {
> -	if (unlikely(test_thread_flag(TIF_SECCOMP)))
> +	if (unlikely(test_thread_flag(TIF_SYSCALL_INTERCEPT)))
>  		return  __secure_computing(NULL);
>  	return 0;
>  }
> diff --git a/include/linux/syscall_intercept.h b/include/linux/syscall_intercept.h
> new file mode 100644
> index 000000000000..725d157699da
> --- /dev/null
> +++ b/include/linux/syscall_intercept.h
> @@ -0,0 +1,70 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020 Collabora Ltd.
> + */
> +#ifndef _SYSCALL_INTERCEPT_H
> +#define _SYSCALL_INTERCEPT_H
> +
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/thread_info.h>
> +
> +#define SYSINT_SECCOMP		0x1

<bikeshed>

Can we maybe use a better name for this? I noone minds the extra
characters I'd suggest:
SYSCALL_INTERCEPT_SECCOMP
or
SYS_INTERCEPT_SECCOMP

</bikeshed>

> +
> +#ifdef TIF_SYSCALL_INTERCEPT
> +
> +/* seccomp (at least) can modify TIF_SYSCALL_INTERCEPT from a different
> + * thread, which means it can race with itself or with
> + * syscall_user_dispatch. Therefore, TIF_SYSCALL_INTERCEPT and
> + * syscall_intercept are synchronized by tsk->sighand->siglock.
> + */
> +
> +static inline void __set_tsk_syscall_intercept(struct task_struct *tsk,
> +					   unsigned int type)
> +{
> +	tsk->syscall_intercept |= type;
> +
> +	if (tsk->syscall_intercept)
> +		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
> +}
> +
> +static inline void __clear_tsk_syscall_intercept(struct task_struct *tsk,
> +					     unsigned int type)
> +{
> +	tsk->syscall_intercept &= ~type;
> +
> +	if (tsk->syscall_intercept == 0)
> +		clear_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
> +}
> +
> +static inline void set_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
> +{
> +	spin_lock_irq(&tsk->sighand->siglock);
> +	__set_tsk_syscall_intercept(tsk, type);
> +	spin_unlock_irq(&tsk->sighand->siglock);
> +}
> +
> +static inline void clear_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
> +{
> +	spin_lock_irq(&tsk->sighand->siglock);
> +	__clear_tsk_syscall_intercept(tsk, type);
> +	spin_unlock_irq(&tsk->sighand->siglock);
> +}
> +
> +#else
> +static inline void __set_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
> +{
> +}
> +static inline void set_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
> +{
> +}
> +static inline void __clear_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
> +{
> +}
> +static inline void clear_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
> +{
> +}
> +#endif
> +
> +#endif
> +
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4d32190861bd..a39177bed8ea 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -49,7 +49,7 @@
>  #include <linux/cgroup.h>
>  #include <linux/security.h>
>  #include <linux/hugetlb.h>
> -#include <linux/seccomp.h>
> +#include <linux/syscall_intercept.h>
>  #include <linux/swap.h>
>  #include <linux/syscalls.h>
>  #include <linux/jiffies.h>
> @@ -898,6 +898,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	 * the usage counts on the error path calling free_task.
>  	 */
>  	tsk->seccomp.filter = NULL;
> +	tsk->syscall_intercept = 0;
>  #endif
>  
>  	setup_thread_stack(tsk, orig);
> @@ -1620,9 +1621,14 @@ static void copy_seccomp(struct task_struct *p)
>  	 * If the parent gained a seccomp mode after copying thread
>  	 * flags and between before we held the sighand lock, we have
>  	 * to manually enable the seccomp thread flag here.
> +	 *
> +	 * In addition current sighand lock is asserted, so it is safe
> +	 * to use the unlocked version of set_tsk_syscall_intercept.
>  	 */
>  	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
> -		set_tsk_thread_flag(p, TIF_SECCOMP);
> +		__set_tsk_syscall_intercept(p, SYSINT_SECCOMP);
> +	else
> +		__clear_tsk_syscall_intercept(p, SYSINT_SECCOMP);
>  #endif
>  }
>  
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 3ee59ce0a323..d0643b500f2e 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -28,6 +28,7 @@
>  #include <linux/slab.h>
>  #include <linux/syscalls.h>
>  #include <linux/sysctl.h>
> +#include <linux/syscall_intercept.h>
>  
>  #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
>  #include <asm/syscall.h>
> @@ -352,14 +353,14 @@ static inline void seccomp_assign_mode(struct task_struct *task,
>  
>  	task->seccomp.mode = seccomp_mode;
>  	/*
> -	 * Make sure TIF_SECCOMP cannot be set before the mode (and
> +	 * Make sure SYSINT_SECCOMP cannot be set before the mode (and
>  	 * filter) is set.
>  	 */
>  	smp_mb__before_atomic();
>  	/* Assume default seccomp processes want spec flaw mitigation. */
>  	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
>  		arch_seccomp_spec_mitigate(task);
> -	set_tsk_thread_flag(task, TIF_SECCOMP);
> +	__set_tsk_syscall_intercept(task, SYSINT_SECCOMP);
>  }
>  
>  #ifdef CONFIG_SECCOMP_FILTER
> @@ -925,7 +926,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
>  
>  	/*
>  	 * Make sure that any changes to mode from another thread have
> -	 * been seen after TIF_SECCOMP was seen.
> +	 * been seen after SYSINT_SECCOMP was seen.
>  	 */
>  	rmb();
>  
> -- 
> 2.28.0
Gabriel Krisman Bertazi Sept. 8, 2020, 4:59 a.m. UTC | #2
Christian Brauner <christian.brauner@ubuntu.com> writes:

> On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
>> index afe01e232935..3511c98a7849 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -959,7 +959,11 @@ struct task_struct {
>>  	kuid_t				loginuid;
>>  	unsigned int			sessionid;
>>  #endif
>> -	struct seccomp			seccomp;
>> +
>> +	struct {
>> +		unsigned int			syscall_intercept;
>> +		struct seccomp			seccomp;
>> +	};
>
> If there's no specific reason to do this I'd not wrap this in an
> anonymous struct. It doesn't really buy anything and there doesn't seem
> to be  precedent in struct task_struct right now. Also, if this somehow
> adds padding it seems you might end up increasing the size of struct
> task_struct more than necessary by accident? (I might be wrong
> though.)

Hi Christian,

Thanks for your review on this and on the other patches of this series.

I wrapped these to prevent struct layout randomization from separating
the flags field from seccomp, as they are going to be used together and
I was trying to reduce overhead to seccomp entry due to two cache misses
when reading this structure.  Measuring it seccomp_benchmark didn't show
any difference with the unwrapped version, so perhaps it was a bit of
premature optimization?

>> diff --git a/include/linux/syscall_intercept.h b/include/linux/syscall_intercept.h
>> new file mode 100644
>> index 000000000000..725d157699da
>> --- /dev/null
>> +++ b/include/linux/syscall_intercept.h
>> @@ -0,0 +1,70 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (C) 2020 Collabora Ltd.
>> + */
>> +#ifndef _SYSCALL_INTERCEPT_H
>> +#define _SYSCALL_INTERCEPT_H
>> +
>> +#include <linux/sched.h>
>> +#include <linux/sched/signal.h>
>> +#include <linux/thread_info.h>
>> +
>> +#define SYSINT_SECCOMP		0x1
>
> <bikeshed>
>
> Can we maybe use a better name for this? I noone minds the extra
> characters I'd suggest:
> SYSCALL_INTERCEPT_SECCOMP
> or
> SYS_INTERCEPT_SECCOMP
>
> </bikeshed>
>

will do.

Thanks,
Peter Zijlstra Sept. 11, 2020, 9:32 a.m. UTC | #3
On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
> +static inline void __set_tsk_syscall_intercept(struct task_struct *tsk,
> +					   unsigned int type)
> +{
> +	tsk->syscall_intercept |= type;
> +
> +	if (tsk->syscall_intercept)
> +		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
> +}

Did the above want to be:

	unsigned int old = tsk->syscall_intercept;
	tsk->syscall_intercept |= type;
	if (!old)
		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT)

?

> +static inline void __clear_tsk_syscall_intercept(struct task_struct *tsk,
> +					     unsigned int type)
> +{
> +	tsk->syscall_intercept &= ~type;
> +
> +	if (tsk->syscall_intercept == 0)
> +		clear_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
> +}
Gabriel Krisman Bertazi Sept. 11, 2020, 8:08 p.m. UTC | #4
peterz@infradead.org writes:

> On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
>> +static inline void __set_tsk_syscall_intercept(struct task_struct *tsk,
>> +					   unsigned int type)
>> +{
>> +	tsk->syscall_intercept |= type;
>> +
>> +	if (tsk->syscall_intercept)
>> +		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
>> +}
>
> Did the above want to be:
>
> 	unsigned int old = tsk->syscall_intercept;
> 	tsk->syscall_intercept |= type;
> 	if (!old)
> 		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT)
>

Hi Peter,

Thanks for the review!

I'm not sure this change gains us anything.  For now,
__set_tsk_syscall_intercept cannot be called with !type, so both
versions behave the same, but my version is safe with that scenario.
This won't be called frequent enough for the extra calls to
set_tsk_thread_flag matter.  Am I missing something?

Thanks,
Kees Cook Sept. 22, 2020, 7:42 p.m. UTC | #5
On Tue, Sep 08, 2020 at 12:59:49AM -0400, Gabriel Krisman Bertazi wrote:
> Christian Brauner <christian.brauner@ubuntu.com> writes:
> 
> > On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
> >> index afe01e232935..3511c98a7849 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -959,7 +959,11 @@ struct task_struct {
> >>  	kuid_t				loginuid;
> >>  	unsigned int			sessionid;
> >>  #endif
> >> -	struct seccomp			seccomp;
> >> +
> >> +	struct {
> >> +		unsigned int			syscall_intercept;
> >> +		struct seccomp			seccomp;
> >> +	};
> >
> > If there's no specific reason to do this I'd not wrap this in an
> > anonymous struct. It doesn't really buy anything and there doesn't seem
> > to be  precedent in struct task_struct right now. Also, if this somehow
> > adds padding it seems you might end up increasing the size of struct
> > task_struct more than necessary by accident? (I might be wrong
> > though.)
> 
> Hi Christian,
> 
> Thanks for your review on this and on the other patches of this series.
> 
> I wrapped these to prevent struct layout randomization from separating
> the flags field from seccomp, as they are going to be used together and
> I was trying to reduce overhead to seccomp entry due to two cache misses
> when reading this structure.  Measuring it seccomp_benchmark didn't show
> any difference with the unwrapped version, so perhaps it was a bit of
> premature optimization?

That should not be a thing to think about here. Structure randomization
already has a mode to protect against cache line issues. I would leave
this as just a new member; no wrapping struct.
Kees Cook Sept. 22, 2020, 7:44 p.m. UTC | #6
On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
> Convert TIF_SECCOMP into a generic TI flag for any syscall interception
> work being done by the kernel.  The actual type of work is exposed by a
> new flag field outside of thread_info.  This ensures that the
> syscall_intercept field is only accessed if struct seccomp has to be
> accessed already, such that it doesn't incur in a much higher cost to
> the seccomp path.
> 
> In order to avoid modifying every architecture at once, this patch has a
> transition mechanism, such that architectures that define TIF_SECCOMP
> continue to work by ignoring the syscall_intercept flag, as long as they
> don't support other syscall interception mechanisms like the future
> syscall user dispatch.  When migrating TIF_SECCOMP to
> TIF_SYSCALL_INTERCEPT, they should adopt the semantics of checking the
> syscall_intercept flag, like it is done in the common entry syscall
> code, or even better, migrate to the common syscall entry code.

Can we "eat" all the other flags like ptrace, audit, etc, too? Doing
this only for seccomp seems strange.
Gabriel Krisman Bertazi Sept. 23, 2020, 8:18 p.m. UTC | #7
Kees Cook <keescook@chromium.org> writes:

> On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
>> Convert TIF_SECCOMP into a generic TI flag for any syscall interception
>> work being done by the kernel.  The actual type of work is exposed by a
>> new flag field outside of thread_info.  This ensures that the
>> syscall_intercept field is only accessed if struct seccomp has to be
>> accessed already, such that it doesn't incur in a much higher cost to
>> the seccomp path.
>> 
>> In order to avoid modifying every architecture at once, this patch has a
>> transition mechanism, such that architectures that define TIF_SECCOMP
>> continue to work by ignoring the syscall_intercept flag, as long as they
>> don't support other syscall interception mechanisms like the future
>> syscall user dispatch.  When migrating TIF_SECCOMP to
>> TIF_SYSCALL_INTERCEPT, they should adopt the semantics of checking the
>> syscall_intercept flag, like it is done in the common entry syscall
>> code, or even better, migrate to the common syscall entry code.
>
> Can we "eat" all the other flags like ptrace, audit, etc, too? Doing
> this only for seccomp seems strange.

Hi Kees, Thanks again for the review.

Yes, we can, and I'm happy to follow up with that as part of my TIF
clean up work, but can we not block the current patchset to be merged
waiting for that, as this already grew a lot from the original feature
submission?

Also, Thomas do you mind this approach to reduce the usage of TIF_
flags?  I remember you suggested simply expanding the variable to 64
bits at some point.

Thanks,
Gabriel Krisman Bertazi Sept. 23, 2020, 8:28 p.m. UTC | #8
Kees Cook <keescook@chromium.org> writes:

> On Tue, Sep 08, 2020 at 12:59:49AM -0400, Gabriel Krisman Bertazi wrote:
>> Christian Brauner <christian.brauner@ubuntu.com> writes:
>> 
>> > On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
>> >> index afe01e232935..3511c98a7849 100644
>> >> --- a/include/linux/sched.h
>> >> +++ b/include/linux/sched.h
>> >> @@ -959,7 +959,11 @@ struct task_struct {
>> >>  	kuid_t				loginuid;
>> >>  	unsigned int			sessionid;
>> >>  #endif
>> >> -	struct seccomp			seccomp;
>> >> +
>> >> +	struct {
>> >> +		unsigned int			syscall_intercept;
>> >> +		struct seccomp			seccomp;
>> >> +	};
>> >
>> > If there's no specific reason to do this I'd not wrap this in an
>> > anonymous struct. It doesn't really buy anything and there doesn't seem
>> > to be  precedent in struct task_struct right now. Also, if this somehow
>> > adds padding it seems you might end up increasing the size of struct
>> > task_struct more than necessary by accident? (I might be wrong
>> > though.)
>> 
>> Hi Christian,
>> 
>> Thanks for your review on this and on the other patches of this series.
>> 
>> I wrapped these to prevent struct layout randomization from separating
>> the flags field from seccomp, as they are going to be used together and
>> I was trying to reduce overhead to seccomp entry due to two cache misses
>> when reading this structure.  Measuring it seccomp_benchmark didn't show
>> any difference with the unwrapped version, so perhaps it was a bit of
>> premature optimization?
>
> That should not be a thing to think about here. Structure randomization
> already has a mode to protect against cache line issues. I would leave
> this as just a new member; no wrapping struct.

Makes sense.  I will drop it for the next iteration.  Thanks!
Kees Cook Sept. 23, 2020, 8:49 p.m. UTC | #9
On Wed, Sep 23, 2020 at 04:18:26PM -0400, Gabriel Krisman Bertazi wrote:
> Kees Cook <keescook@chromium.org> writes:
> 
> > On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
> >> Convert TIF_SECCOMP into a generic TI flag for any syscall interception
> >> work being done by the kernel.  The actual type of work is exposed by a
> >> new flag field outside of thread_info.  This ensures that the
> >> syscall_intercept field is only accessed if struct seccomp has to be
> >> accessed already, such that it doesn't incur in a much higher cost to
> >> the seccomp path.
> >> 
> >> In order to avoid modifying every architecture at once, this patch has a
> >> transition mechanism, such that architectures that define TIF_SECCOMP
> >> continue to work by ignoring the syscall_intercept flag, as long as they
> >> don't support other syscall interception mechanisms like the future
> >> syscall user dispatch.  When migrating TIF_SECCOMP to
> >> TIF_SYSCALL_INTERCEPT, they should adopt the semantics of checking the
> >> syscall_intercept flag, like it is done in the common entry syscall
> >> code, or even better, migrate to the common syscall entry code.
> >
> > Can we "eat" all the other flags like ptrace, audit, etc, too? Doing
> > this only for seccomp seems strange.
> 
> Hi Kees, Thanks again for the review.
> 
> Yes, we can, and I'm happy to follow up with that as part of my TIF
> clean up work, but can we not block the current patchset to be merged
> waiting for that, as this already grew a lot from the original feature
> submission?

In that case, I'd say just add the new TIF flag. The consolidation can
come later.
Peter Zijlstra Sept. 24, 2020, 11:24 a.m. UTC | #10
On Fri, Sep 11, 2020 at 04:08:45PM -0400, Gabriel Krisman Bertazi wrote:
> peterz@infradead.org writes:
> 
> > On Fri, Sep 04, 2020 at 04:31:39PM -0400, Gabriel Krisman Bertazi wrote:
> >> +static inline void __set_tsk_syscall_intercept(struct task_struct *tsk,
> >> +					   unsigned int type)
> >> +{
> >> +	tsk->syscall_intercept |= type;
> >> +
> >> +	if (tsk->syscall_intercept)
> >> +		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
> >> +}
> >
> > Did the above want to be:
> >
> > 	unsigned int old = tsk->syscall_intercept;
> > 	tsk->syscall_intercept |= type;
> > 	if (!old)
> > 		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT)
> >
> 
> Hi Peter,
> 
> Thanks for the review!
> 
> I'm not sure this change gains us anything.  For now,
> __set_tsk_syscall_intercept cannot be called with !type, so both
> versions behave the same, but my version is safe with that scenario.
> This won't be called frequent enough for the extra calls to
> set_tsk_thread_flag matter.  Am I missing something?

Your version will do set_tsk_thread_flag() for every invocation
(assuming non-zero type). That's sub-optimal.
Thomas Gleixner Sept. 25, 2020, 8 a.m. UTC | #11
On Wed, Sep 23 2020 at 13:49, Kees Cook wrote:
> On Wed, Sep 23, 2020 at 04:18:26PM -0400, Gabriel Krisman Bertazi wrote:
>> Kees Cook <keescook@chromium.org> writes:
>> Yes, we can, and I'm happy to follow up with that as part of my TIF
>> clean up work, but can we not block the current patchset to be merged
>> waiting for that, as this already grew a lot from the original feature
>> submission?
>
> In that case, I'd say just add the new TIF flag. The consolidation can
> come later.

No. This is exactly the wrong order. Cleanup and consolidation have
precedence over features. I'm tired of 'we'll do that later' songs,
simply because in the very end I'm going to be the idiot who mops up the
resulting mess.

Thanks,

        tglx
Gabriel Krisman Bertazi Sept. 25, 2020, 4:15 p.m. UTC | #12
Thomas Gleixner <tglx@linutronix.de> writes:

> On Wed, Sep 23 2020 at 13:49, Kees Cook wrote:
>> On Wed, Sep 23, 2020 at 04:18:26PM -0400, Gabriel Krisman Bertazi wrote:
>>> Kees Cook <keescook@chromium.org> writes:
>>> Yes, we can, and I'm happy to follow up with that as part of my TIF
>>> clean up work, but can we not block the current patchset to be merged
>>> waiting for that, as this already grew a lot from the original feature
>>> submission?
>>
>> In that case, I'd say just add the new TIF flag. The consolidation can
>> come later.
>
> No. This is exactly the wrong order. Cleanup and consolidation have
> precedence over features. I'm tired of 'we'll do that later' songs,
> simply because in the very end I'm going to be the idiot who mops up the
> resulting mess.
>

No problem.  I will follow up with a patchset consolidating those flags
into this syscall_intercept interface I proposed.  I assume there is no
immediate concerns with the consolidation approach itself.
Kees Cook Sept. 25, 2020, 8:30 p.m. UTC | #13
On Fri, Sep 25, 2020 at 12:15:54PM -0400, Gabriel Krisman Bertazi wrote:
> Thomas Gleixner <tglx@linutronix.de> writes:
> 
> > On Wed, Sep 23 2020 at 13:49, Kees Cook wrote:
> >> On Wed, Sep 23, 2020 at 04:18:26PM -0400, Gabriel Krisman Bertazi wrote:
> >>> Kees Cook <keescook@chromium.org> writes:
> >>> Yes, we can, and I'm happy to follow up with that as part of my TIF
> >>> clean up work, but can we not block the current patchset to be merged
> >>> waiting for that, as this already grew a lot from the original feature
> >>> submission?
> >>
> >> In that case, I'd say just add the new TIF flag. The consolidation can
> >> come later.
> >
> > No. This is exactly the wrong order. Cleanup and consolidation have
> > precedence over features. I'm tired of 'we'll do that later' songs,
> > simply because in the very end I'm going to be the idiot who mops up the
> > resulting mess.
> >
> 
> No problem.  I will follow up with a patchset consolidating those flags
> into this syscall_intercept interface I proposed.  I assume there is no
> immediate concerns with the consolidation approach itself.

I think the only issue is just finding a clean way to set/unset the
flags safely/quickly (a lock seems too heavy to me).

Should thread_info hold an entire u32 for all intercept flags (then the
TIF_WORK tests is just a zero-test of the intercept u32 word)? Or should
there be a TIF_INTERCEPT and a totally separate u32 (e.g. in
task_struct) indicating which intercepts? (And if they're separate, how
do we atomically set/unset)

i.e.:

atomic_start
	toggle a per-intercept bit
	set TIF_INTERCEPT = !!(intercept word)
atomic_end
diff mbox series

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index afe01e232935..3511c98a7849 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -959,7 +959,11 @@  struct task_struct {
 	kuid_t				loginuid;
 	unsigned int			sessionid;
 #endif
-	struct seccomp			seccomp;
+
+	struct {
+		unsigned int			syscall_intercept;
+		struct seccomp			seccomp;
+	};
 
 	/* Thread group tracking: */
 	u64				parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 02aef2844c38..027dc462cea9 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -20,6 +20,24 @@ 
 #include <linux/atomic.h>
 #include <asm/seccomp.h>
 
+/*
+ * Some transitional defines to avoid migrating every architecture code
+ * at once.
+ */
+
+#if defined(TIF_SECCOMP) && defined(TIF_SYSCALL_INTERCEPT)
+# error "TIF_SYSCALL_INTERCEPT and TIF_SECCOMP can't be defined at the same time"
+#endif
+
+/*
+ * If the arch has not transitioned to TIF_SYSCALL_INTERCEPT, this let
+ * seccomp work with these architectures, as long as no other syscall
+ * intercept features are meant to be supported.
+ */
+#ifdef TIF_SECCOMP
+# define TIF_SYSCALL_INTERCEPT TIF_SECCOMP
+#endif
+
 struct seccomp_filter;
 /**
  * struct seccomp - the state of a seccomp'ed process
@@ -42,7 +60,7 @@  struct seccomp {
 extern int __secure_computing(const struct seccomp_data *sd);
 static inline int secure_computing(void)
 {
-	if (unlikely(test_thread_flag(TIF_SECCOMP)))
+	if (unlikely(test_thread_flag(TIF_SYSCALL_INTERCEPT)))
 		return  __secure_computing(NULL);
 	return 0;
 }
diff --git a/include/linux/syscall_intercept.h b/include/linux/syscall_intercept.h
new file mode 100644
index 000000000000..725d157699da
--- /dev/null
+++ b/include/linux/syscall_intercept.h
@@ -0,0 +1,70 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Collabora Ltd.
+ */
+#ifndef _SYSCALL_INTERCEPT_H
+#define _SYSCALL_INTERCEPT_H
+
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/thread_info.h>
+
+#define SYSINT_SECCOMP		0x1
+
+#ifdef TIF_SYSCALL_INTERCEPT
+
+/* seccomp (at least) can modify TIF_SYSCALL_INTERCEPT from a different
+ * thread, which means it can race with itself or with
+ * syscall_user_dispatch. Therefore, TIF_SYSCALL_INTERCEPT and
+ * syscall_intercept are synchronized by tsk->sighand->siglock.
+ */
+
+static inline void __set_tsk_syscall_intercept(struct task_struct *tsk,
+					   unsigned int type)
+{
+	tsk->syscall_intercept |= type;
+
+	if (tsk->syscall_intercept)
+		set_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
+}
+
+static inline void __clear_tsk_syscall_intercept(struct task_struct *tsk,
+					     unsigned int type)
+{
+	tsk->syscall_intercept &= ~type;
+
+	if (tsk->syscall_intercept == 0)
+		clear_tsk_thread_flag(tsk, TIF_SYSCALL_INTERCEPT);
+}
+
+static inline void set_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
+{
+	spin_lock_irq(&tsk->sighand->siglock);
+	__set_tsk_syscall_intercept(tsk, type);
+	spin_unlock_irq(&tsk->sighand->siglock);
+}
+
+static inline void clear_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
+{
+	spin_lock_irq(&tsk->sighand->siglock);
+	__clear_tsk_syscall_intercept(tsk, type);
+	spin_unlock_irq(&tsk->sighand->siglock);
+}
+
+#else
+static inline void __set_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
+{
+}
+static inline void set_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
+{
+}
+static inline void __clear_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
+{
+}
+static inline void clear_tsk_syscall_intercept(struct task_struct *tsk, unsigned int type)
+{
+}
+#endif
+
+#endif
+
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190861bd..a39177bed8ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,7 +49,7 @@ 
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
-#include <linux/seccomp.h>
+#include <linux/syscall_intercept.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -898,6 +898,7 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	 * the usage counts on the error path calling free_task.
 	 */
 	tsk->seccomp.filter = NULL;
+	tsk->syscall_intercept = 0;
 #endif
 
 	setup_thread_stack(tsk, orig);
@@ -1620,9 +1621,14 @@  static void copy_seccomp(struct task_struct *p)
 	 * If the parent gained a seccomp mode after copying thread
 	 * flags and between before we held the sighand lock, we have
 	 * to manually enable the seccomp thread flag here.
+	 *
+	 * In addition current sighand lock is asserted, so it is safe
+	 * to use the unlocked version of set_tsk_syscall_intercept.
 	 */
 	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
-		set_tsk_thread_flag(p, TIF_SECCOMP);
+		__set_tsk_syscall_intercept(p, SYSINT_SECCOMP);
+	else
+		__clear_tsk_syscall_intercept(p, SYSINT_SECCOMP);
 #endif
 }
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3ee59ce0a323..d0643b500f2e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -28,6 +28,7 @@ 
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/sysctl.h>
+#include <linux/syscall_intercept.h>
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 #include <asm/syscall.h>
@@ -352,14 +353,14 @@  static inline void seccomp_assign_mode(struct task_struct *task,
 
 	task->seccomp.mode = seccomp_mode;
 	/*
-	 * Make sure TIF_SECCOMP cannot be set before the mode (and
+	 * Make sure SYSINT_SECCOMP cannot be set before the mode (and
 	 * filter) is set.
 	 */
 	smp_mb__before_atomic();
 	/* Assume default seccomp processes want spec flaw mitigation. */
 	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
 		arch_seccomp_spec_mitigate(task);
-	set_tsk_thread_flag(task, TIF_SECCOMP);
+	__set_tsk_syscall_intercept(task, SYSINT_SECCOMP);
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
@@ -925,7 +926,7 @@  static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 
 	/*
 	 * Make sure that any changes to mode from another thread have
-	 * been seen after TIF_SECCOMP was seen.
+	 * been seen after SYSINT_SECCOMP was seen.
 	 */
 	rmb();