diff mbox series

[v3,seccomp,5/5] seccomp/cache: Report cache data through /proc/pid/seccomp_cache

Message ID d3d1c05ea0be2b192f480ec52ad64bffbb22dc9d.1601478774.git.yifeifz2@illinois.edu (mailing list archive)
State Not Applicable
Headers show
Series seccomp: Add bitmap cache of constant allow filter results | expand

Commit Message

YiFei Zhu Sept. 30, 2020, 3:19 p.m. UTC
From: YiFei Zhu <yifeifz2@illinois.edu>

Currently the kernel does not provide an infrastructure to translate
architecture numbers to a human-readable name. Translating syscall
numbers to syscall names is possible through FTRACE_SYSCALL
infrastructure but it does not provide support for compat syscalls.

This will create a file for each PID as /proc/pid/seccomp_cache.
The file will be empty when no seccomp filters are loaded, or be
in the format of:
<arch name> <decimal syscall number> <ALLOW | FILTER>
where ALLOW means the cache is guaranteed to allow the syscall,
and filter means the cache will pass the syscall to the BPF filter.

For the docker default profile on x86_64 it looks like:
x86_64 0 ALLOW
x86_64 1 ALLOW
x86_64 2 ALLOW
x86_64 3 ALLOW
[...]
x86_64 132 ALLOW
x86_64 133 ALLOW
x86_64 134 FILTER
x86_64 135 FILTER
x86_64 136 FILTER
x86_64 137 ALLOW
x86_64 138 ALLOW
x86_64 139 FILTER
x86_64 140 ALLOW
x86_64 141 ALLOW
[...]

This file is guarded by CONFIG_DEBUG_SECCOMP_CACHE with a default
of N because I think certain users of seccomp might not want the
application to know which syscalls are definitely usable. For
the same reason, it is also guarded by CAP_SYS_ADMIN.

Suggested-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
---
 arch/Kconfig                   | 15 +++++++++++
 arch/x86/include/asm/seccomp.h |  3 +++
 fs/proc/base.c                 |  3 +++
 include/linux/seccomp.h        |  5 ++++
 kernel/seccomp.c               | 46 ++++++++++++++++++++++++++++++++++
 5 files changed, 72 insertions(+)

Comments

Jann Horn Sept. 30, 2020, 10 p.m. UTC | #1
On Wed, Sep 30, 2020 at 5:20 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> Currently the kernel does not provide an infrastructure to translate
> architecture numbers to a human-readable name. Translating syscall
> numbers to syscall names is possible through FTRACE_SYSCALL
> infrastructure but it does not provide support for compat syscalls.
>
> This will create a file for each PID as /proc/pid/seccomp_cache.
> The file will be empty when no seccomp filters are loaded, or be
> in the format of:
> <arch name> <decimal syscall number> <ALLOW | FILTER>
> where ALLOW means the cache is guaranteed to allow the syscall,
> and filter means the cache will pass the syscall to the BPF filter.
>
> For the docker default profile on x86_64 it looks like:
> x86_64 0 ALLOW
> x86_64 1 ALLOW
> x86_64 2 ALLOW
> x86_64 3 ALLOW
> [...]
> x86_64 132 ALLOW
> x86_64 133 ALLOW
> x86_64 134 FILTER
> x86_64 135 FILTER
> x86_64 136 FILTER
> x86_64 137 ALLOW
> x86_64 138 ALLOW
> x86_64 139 FILTER
> x86_64 140 ALLOW
> x86_64 141 ALLOW
> [...]

Oooh, neat! :) Thanks!

> Suggested-by: Jann Horn <jannh@google.com>
> Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
> ---
>  arch/Kconfig                   | 15 +++++++++++
>  arch/x86/include/asm/seccomp.h |  3 +++
>  fs/proc/base.c                 |  3 +++
>  include/linux/seccomp.h        |  5 ++++
>  kernel/seccomp.c               | 46 ++++++++++++++++++++++++++++++++++
>  5 files changed, 72 insertions(+)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index ca867b2a5d71..b840cadcc882 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -478,6 +478,7 @@ config HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
>           - all the requirements for HAVE_ARCH_SECCOMP_FILTER
>           - SECCOMP_ARCH_DEFAULT
>           - SECCOMP_ARCH_DEFAULT_NR
> +         - SECCOMP_ARCH_DEFAULT_NAME
>
>  config SECCOMP
>         prompt "Enable seccomp to safely execute untrusted bytecode"
> @@ -532,6 +533,20 @@ config SECCOMP_CACHE_NR_ONLY
>
>  endchoice
>
> +config DEBUG_SECCOMP_CACHE
> +       bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
> +       depends on SECCOMP_CACHE_NR_ONLY
> +       depends on PROC_FS
> +       help
> +         This is enables /proc/pid/seccomp_cache interface to monitor

nit: s/is enables/enables/

> +         seccomp cache data. The file format is subject to change. Reading
> +         the file requires CAP_SYS_ADMIN.
> +
> +         This option is for debugging only. Enabling present the risk that
> +         an adversary may be able to infer the seccomp filter logic.
> +
> +         If unsure, say N.
> +
[...]
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
[...]
> +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
> +                          struct pid *pid, struct task_struct *task)
> +{
> +       struct seccomp_filter *f;
> +
> +       /*
> +        * We don't want some sandboxed process know what their seccomp
> +        * filters consist of.
> +        */
> +       if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
> +               return -EACCES;
> +
> +       f = READ_ONCE(task->seccomp.filter);
> +       if (!f)
> +               return 0;

Hmm, this won't work, because the task could be exiting, and seccomp
filters are detached in release_task() (using
seccomp_filter_release()). And at the moment, seccomp_filter_release()
just locklessly NULLs out the tsk->seccomp.filter pointer and drops
the reference.

The locking here is kind of gross, but basically I think you can
change this code to use lock_task_sighand() / unlock_task_sighand()
(see the other examples in fs/proc/base.c), and bail out if
lock_task_sighand() returns NULL. And in seccomp_filter_release(), add
something like this:

/* We are effectively holding the siglock by not having any sighand. */
WARN_ON(tsk->sighand != NULL);

> +#ifdef SECCOMP_ARCH_DEFAULT
> +       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_DEFAULT_NAME,
> +                                   f->cache.syscall_allow_default,
> +                                   SECCOMP_ARCH_DEFAULT_NR);
> +#endif /* SECCOMP_ARCH_DEFAULT */
> +
> +#ifdef SECCOMP_ARCH_COMPAT
> +       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
> +                                   f->cache.syscall_allow_compat,
> +                                   SECCOMP_ARCH_COMPAT_NR);
> +#endif /* SECCOMP_ARCH_COMPAT */
> +       return 0;
> +}
> +#endif /* CONFIG_DEBUG_SECCOMP_CACHE */
> --
> 2.28.0
>
Kees Cook Sept. 30, 2020, 10:59 p.m. UTC | #2
On Wed, Sep 30, 2020 at 10:19:16AM -0500, YiFei Zhu wrote:
> From: YiFei Zhu <yifeifz2@illinois.edu>
> 
> Currently the kernel does not provide an infrastructure to translate
> architecture numbers to a human-readable name. Translating syscall
> numbers to syscall names is possible through FTRACE_SYSCALL
> infrastructure but it does not provide support for compat syscalls.
> 
> This will create a file for each PID as /proc/pid/seccomp_cache.
> The file will be empty when no seccomp filters are loaded, or be
> in the format of:
> <arch name> <decimal syscall number> <ALLOW | FILTER>
> where ALLOW means the cache is guaranteed to allow the syscall,
> and filter means the cache will pass the syscall to the BPF filter.
> 
> For the docker default profile on x86_64 it looks like:
> x86_64 0 ALLOW
> x86_64 1 ALLOW
> x86_64 2 ALLOW
> x86_64 3 ALLOW
> [...]
> x86_64 132 ALLOW
> x86_64 133 ALLOW
> x86_64 134 FILTER
> x86_64 135 FILTER
> x86_64 136 FILTER
> x86_64 137 ALLOW
> x86_64 138 ALLOW
> x86_64 139 FILTER
> x86_64 140 ALLOW
> x86_64 141 ALLOW
> [...]
> 
> This file is guarded by CONFIG_DEBUG_SECCOMP_CACHE with a default
> of N because I think certain users of seccomp might not want the
> application to know which syscalls are definitely usable. For
> the same reason, it is also guarded by CAP_SYS_ADMIN.
> 
> Suggested-by: Jann Horn <jannh@google.com>
> Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
> ---
>  arch/Kconfig                   | 15 +++++++++++
>  arch/x86/include/asm/seccomp.h |  3 +++
>  fs/proc/base.c                 |  3 +++
>  include/linux/seccomp.h        |  5 ++++
>  kernel/seccomp.c               | 46 ++++++++++++++++++++++++++++++++++
>  5 files changed, 72 insertions(+)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index ca867b2a5d71..b840cadcc882 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -478,6 +478,7 @@ config HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
>  	  - all the requirements for HAVE_ARCH_SECCOMP_FILTER
>  	  - SECCOMP_ARCH_DEFAULT
>  	  - SECCOMP_ARCH_DEFAULT_NR
> +	  - SECCOMP_ARCH_DEFAULT_NAME
>  
>  config SECCOMP
>  	prompt "Enable seccomp to safely execute untrusted bytecode"
> @@ -532,6 +533,20 @@ config SECCOMP_CACHE_NR_ONLY
>  
>  endchoice
>  
> +config DEBUG_SECCOMP_CACHE

naming nit: I prefer where what how order, so SECCOMP_CACHE_DEBUG.

> +	bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
> +	depends on SECCOMP_CACHE_NR_ONLY
> +	depends on PROC_FS
> +	help
> +	  This is enables /proc/pid/seccomp_cache interface to monitor
> +	  seccomp cache data. The file format is subject to change. Reading
> +	  the file requires CAP_SYS_ADMIN.
> +
> +	  This option is for debugging only. Enabling present the risk that
> +	  an adversary may be able to infer the seccomp filter logic.
> +
> +	  If unsure, say N.
> +
>  config HAVE_ARCH_STACKLEAK
>  	bool
>  	help
> diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
> index 7b3a58271656..33ccc074be7a 100644
> --- a/arch/x86/include/asm/seccomp.h
> +++ b/arch/x86/include/asm/seccomp.h
> @@ -19,13 +19,16 @@
>  #ifdef CONFIG_X86_64
>  # define SECCOMP_ARCH_DEFAULT			AUDIT_ARCH_X86_64
>  # define SECCOMP_ARCH_DEFAULT_NR		NR_syscalls
> +# define SECCOMP_ARCH_DEFAULT_NAME		"x86_64"
>  # ifdef CONFIG_COMPAT
>  #  define SECCOMP_ARCH_COMPAT			AUDIT_ARCH_I386
>  #  define SECCOMP_ARCH_COMPAT_NR		IA32_NR_syscalls
> +#  define SECCOMP_ARCH_COMPAT_NAME		"x86_32"

I think this should be "ia32"? Is there a good definitive guide on this
naming convention?
Jann Horn Sept. 30, 2020, 11:08 p.m. UTC | #3
[adding x86 folks to enhance bikeshedding]

On Thu, Oct 1, 2020 at 12:59 AM Kees Cook <keescook@chromium.org> wrote:
> On Wed, Sep 30, 2020 at 10:19:16AM -0500, YiFei Zhu wrote:
> > From: YiFei Zhu <yifeifz2@illinois.edu>
> >
> > Currently the kernel does not provide an infrastructure to translate
> > architecture numbers to a human-readable name. Translating syscall
> > numbers to syscall names is possible through FTRACE_SYSCALL
> > infrastructure but it does not provide support for compat syscalls.
> >
> > This will create a file for each PID as /proc/pid/seccomp_cache.
> > The file will be empty when no seccomp filters are loaded, or be
> > in the format of:
> > <arch name> <decimal syscall number> <ALLOW | FILTER>
> > where ALLOW means the cache is guaranteed to allow the syscall,
> > and filter means the cache will pass the syscall to the BPF filter.
> >
> > For the docker default profile on x86_64 it looks like:
> > x86_64 0 ALLOW
> > x86_64 1 ALLOW
> > x86_64 2 ALLOW
> > x86_64 3 ALLOW
> > [...]
> > x86_64 132 ALLOW
> > x86_64 133 ALLOW
> > x86_64 134 FILTER
> > x86_64 135 FILTER
> > x86_64 136 FILTER
> > x86_64 137 ALLOW
> > x86_64 138 ALLOW
> > x86_64 139 FILTER
> > x86_64 140 ALLOW
> > x86_64 141 ALLOW
[...]
> > diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
> > index 7b3a58271656..33ccc074be7a 100644
> > --- a/arch/x86/include/asm/seccomp.h
> > +++ b/arch/x86/include/asm/seccomp.h
> > @@ -19,13 +19,16 @@
> >  #ifdef CONFIG_X86_64
> >  # define SECCOMP_ARCH_DEFAULT                        AUDIT_ARCH_X86_64
> >  # define SECCOMP_ARCH_DEFAULT_NR             NR_syscalls
> > +# define SECCOMP_ARCH_DEFAULT_NAME           "x86_64"
> >  # ifdef CONFIG_COMPAT
> >  #  define SECCOMP_ARCH_COMPAT                        AUDIT_ARCH_I386
> >  #  define SECCOMP_ARCH_COMPAT_NR             IA32_NR_syscalls
> > +#  define SECCOMP_ARCH_COMPAT_NAME           "x86_32"
>
> I think this should be "ia32"? Is there a good definitive guide on this
> naming convention?

"man 2 syscall" calls them "x86-64" and "i386". The syscall table
files use ABI names "i386" and "64". The syscall stub prefixes use
"x64" and "ia32".

I don't think we have a good consistent naming strategy here. :P
Kees Cook Sept. 30, 2020, 11:12 p.m. UTC | #4
On Thu, Oct 01, 2020 at 12:00:46AM +0200, Jann Horn wrote:
> On Wed, Sep 30, 2020 at 5:20 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> [...]
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> [...]
> > +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
> > +                          struct pid *pid, struct task_struct *task)
> > +{
> > +       struct seccomp_filter *f;
> > +
> > +       /*
> > +        * We don't want some sandboxed process know what their seccomp
> > +        * filters consist of.
> > +        */
> > +       if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
> > +               return -EACCES;
> > +
> > +       f = READ_ONCE(task->seccomp.filter);
> > +       if (!f)
> > +               return 0;
> 
> Hmm, this won't work, because the task could be exiting, and seccomp
> filters are detached in release_task() (using
> seccomp_filter_release()). And at the moment, seccomp_filter_release()
> just locklessly NULLs out the tsk->seccomp.filter pointer and drops
> the reference.

Oh nice catch. Yeah, this would only happen if it was the only filter
remaining on a process with no children, etc.

> 
> The locking here is kind of gross, but basically I think you can
> change this code to use lock_task_sighand() / unlock_task_sighand()
> (see the other examples in fs/proc/base.c), and bail out if
> lock_task_sighand() returns NULL. And in seccomp_filter_release(), add
> something like this:
> 
> /* We are effectively holding the siglock by not having any sighand. */
> WARN_ON(tsk->sighand != NULL);

Yeah, good idea.
Kees Cook Sept. 30, 2020, 11:21 p.m. UTC | #5
On Thu, Oct 01, 2020 at 01:08:04AM +0200, Jann Horn wrote:
> [adding x86 folks to enhance bikeshedding]
> 
> On Thu, Oct 1, 2020 at 12:59 AM Kees Cook <keescook@chromium.org> wrote:
> > On Wed, Sep 30, 2020 at 10:19:16AM -0500, YiFei Zhu wrote:
> > > From: YiFei Zhu <yifeifz2@illinois.edu>
> > >
> > > Currently the kernel does not provide an infrastructure to translate
> > > architecture numbers to a human-readable name. Translating syscall
> > > numbers to syscall names is possible through FTRACE_SYSCALL
> > > infrastructure but it does not provide support for compat syscalls.
> > >
> > > This will create a file for each PID as /proc/pid/seccomp_cache.
> > > The file will be empty when no seccomp filters are loaded, or be
> > > in the format of:
> > > <arch name> <decimal syscall number> <ALLOW | FILTER>
> > > where ALLOW means the cache is guaranteed to allow the syscall,
> > > and filter means the cache will pass the syscall to the BPF filter.
> > >
> > > For the docker default profile on x86_64 it looks like:
> > > x86_64 0 ALLOW
> > > x86_64 1 ALLOW
> > > x86_64 2 ALLOW
> > > x86_64 3 ALLOW
> > > [...]
> > > x86_64 132 ALLOW
> > > x86_64 133 ALLOW
> > > x86_64 134 FILTER
> > > x86_64 135 FILTER
> > > x86_64 136 FILTER
> > > x86_64 137 ALLOW
> > > x86_64 138 ALLOW
> > > x86_64 139 FILTER
> > > x86_64 140 ALLOW
> > > x86_64 141 ALLOW
> [...]
> > > diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
> > > index 7b3a58271656..33ccc074be7a 100644
> > > --- a/arch/x86/include/asm/seccomp.h
> > > +++ b/arch/x86/include/asm/seccomp.h
> > > @@ -19,13 +19,16 @@
> > >  #ifdef CONFIG_X86_64
> > >  # define SECCOMP_ARCH_DEFAULT                        AUDIT_ARCH_X86_64
> > >  # define SECCOMP_ARCH_DEFAULT_NR             NR_syscalls
> > > +# define SECCOMP_ARCH_DEFAULT_NAME           "x86_64"
> > >  # ifdef CONFIG_COMPAT
> > >  #  define SECCOMP_ARCH_COMPAT                        AUDIT_ARCH_I386
> > >  #  define SECCOMP_ARCH_COMPAT_NR             IA32_NR_syscalls
> > > +#  define SECCOMP_ARCH_COMPAT_NAME           "x86_32"
> >
> > I think this should be "ia32"? Is there a good definitive guide on this
> > naming convention?
> 
> "man 2 syscall" calls them "x86-64" and "i386". The syscall table
> files use ABI names "i386" and "64". The syscall stub prefixes use
> "x64" and "ia32".
> 
> I don't think we have a good consistent naming strategy here. :P

Agreed. And with "i386" being so hopelessly inaccurate, I prefer
"ia32" ... *shrug*

I would hope we don't have to be super-pedantic and call them "x86-64" and "IA-32". :P
YiFei Zhu Oct. 1, 2020, 12:06 p.m. UTC | #6
On Wed, Sep 30, 2020 at 5:01 PM Jann Horn <jannh@google.com> wrote:
> Hmm, this won't work, because the task could be exiting, and seccomp
> filters are detached in release_task() (using
> seccomp_filter_release()). And at the moment, seccomp_filter_release()
> just locklessly NULLs out the tsk->seccomp.filter pointer and drops
> the reference.
>
> The locking here is kind of gross, but basically I think you can
> change this code to use lock_task_sighand() / unlock_task_sighand()
> (see the other examples in fs/proc/base.c), and bail out if
> lock_task_sighand() returns NULL. And in seccomp_filter_release(), add
> something like this:
>
> /* We are effectively holding the siglock by not having any sighand. */
> WARN_ON(tsk->sighand != NULL);

Ah thanks. I was thinking about how tasks exit and get freed and that
sort of stuff, and how this would race against them. The last time I
worked with procfs there was some magic going on that I could not
figure out, so I was thinking if some magic will stop the task_struct
from being released, considering it's an argument here.

I just looked at release_task and related functions; looks like it
will, at the end, decrease the reference count of the task_struct.
Does procfs increase the refcount while calling the procfs functions?
Hence, in procfs functions one can rely on the task_struct still being
a task_struct, but any direct effects of release_task may happen while
the procfs functions are running?

YiFei Zhu
Jann Horn Oct. 1, 2020, 4:05 p.m. UTC | #7
On Thu, Oct 1, 2020 at 2:06 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> On Wed, Sep 30, 2020 at 5:01 PM Jann Horn <jannh@google.com> wrote:
> > Hmm, this won't work, because the task could be exiting, and seccomp
> > filters are detached in release_task() (using
> > seccomp_filter_release()). And at the moment, seccomp_filter_release()
> > just locklessly NULLs out the tsk->seccomp.filter pointer and drops
> > the reference.
> >
> > The locking here is kind of gross, but basically I think you can
> > change this code to use lock_task_sighand() / unlock_task_sighand()
> > (see the other examples in fs/proc/base.c), and bail out if
> > lock_task_sighand() returns NULL. And in seccomp_filter_release(), add
> > something like this:
> >
> > /* We are effectively holding the siglock by not having any sighand. */
> > WARN_ON(tsk->sighand != NULL);
>
> Ah thanks. I was thinking about how tasks exit and get freed and that
> sort of stuff, and how this would race against them. The last time I
> worked with procfs there was some magic going on that I could not
> figure out, so I was thinking if some magic will stop the task_struct
> from being released, considering it's an argument here.
>
> I just looked at release_task and related functions; looks like it
> will, at the end, decrease the reference count of the task_struct.
> Does procfs increase the refcount while calling the procfs functions?
> Hence, in procfs functions one can rely on the task_struct still being
> a task_struct, but any direct effects of release_task may happen while
> the procfs functions are running?

Yeah.

The ONE() entry you're adding to tgid_base_stuff is used to help
instantiate a "struct inode" when someone looks up the path
"/proc/$tgid/seccomp_cache"; then when that path is opened, a "struct
file" is created that holds a reference to the inode; and while that
file exists, your proc_pid_seccomp_cache() can be invoked.

proc_pid_seccomp_cache() is invoked from proc_single_show()
("PROC_I(inode)->op.proc_show" is proc_pid_seccomp_cache), and
proc_single_show() obtains a temporary reference to the task_struct
using get_pid_task() on a "struct pid" and drops that reference
afterwards with put_task_struct(). The "struct pid" is obtained from
the "struct proc_inode", which is essentially a subclass of "struct
inode". The "struct pid" is kept refererenced until the inode goes
away, via proc_pid_evict_inode(), called by proc_evict_inode().

By looking at put_task_struct() and its callees, you can figure out
which parts of the "struct task" are kept alive by the reference to
it.

By the way, maybe it'd make sense to add this to tid_base_stuff as
well? That should just be one extra line of code. Seccomp filters are
technically per-thread, so it would make sense to have them visible in
the per-thread subdirectories /proc/$pid/task/$tid/.
YiFei Zhu Oct. 1, 2020, 4:18 p.m. UTC | #8
On Thu, Oct 1, 2020 at 11:05 AM Jann Horn <jannh@google.com> wrote:
> Yeah.
>
> The ONE() entry you're adding to tgid_base_stuff is used to help
> instantiate a "struct inode" when someone looks up the path
> "/proc/$tgid/seccomp_cache"; then when that path is opened, a "struct
> file" is created that holds a reference to the inode; and while that
> file exists, your proc_pid_seccomp_cache() can be invoked.
>
> proc_pid_seccomp_cache() is invoked from proc_single_show()
> ("PROC_I(inode)->op.proc_show" is proc_pid_seccomp_cache), and
> proc_single_show() obtains a temporary reference to the task_struct
> using get_pid_task() on a "struct pid" and drops that reference
> afterwards with put_task_struct(). The "struct pid" is obtained from
> the "struct proc_inode", which is essentially a subclass of "struct
> inode". The "struct pid" is kept refererenced until the inode goes
> away, via proc_pid_evict_inode(), called by proc_evict_inode().
>
> By looking at put_task_struct() and its callees, you can figure out
> which parts of the "struct task" are kept alive by the reference to
> it.

Ah I see. Thanks for the explanation.

> By the way, maybe it'd make sense to add this to tid_base_stuff as
> well? That should just be one extra line of code. Seccomp filters are
> technically per-thread, so it would make sense to have them visible in
> the per-thread subdirectories /proc/$pid/task/$tid/.

Right. Will do.

YiFei Zhu
diff mbox series

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index ca867b2a5d71..b840cadcc882 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -478,6 +478,7 @@  config HAVE_ARCH_SECCOMP_CACHE_NR_ONLY
 	  - all the requirements for HAVE_ARCH_SECCOMP_FILTER
 	  - SECCOMP_ARCH_DEFAULT
 	  - SECCOMP_ARCH_DEFAULT_NR
+	  - SECCOMP_ARCH_DEFAULT_NAME
 
 config SECCOMP
 	prompt "Enable seccomp to safely execute untrusted bytecode"
@@ -532,6 +533,20 @@  config SECCOMP_CACHE_NR_ONLY
 
 endchoice
 
+config DEBUG_SECCOMP_CACHE
+	bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
+	depends on SECCOMP_CACHE_NR_ONLY
+	depends on PROC_FS
+	help
+	  This is enables /proc/pid/seccomp_cache interface to monitor
+	  seccomp cache data. The file format is subject to change. Reading
+	  the file requires CAP_SYS_ADMIN.
+
+	  This option is for debugging only. Enabling present the risk that
+	  an adversary may be able to infer the seccomp filter logic.
+
+	  If unsure, say N.
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 7b3a58271656..33ccc074be7a 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -19,13 +19,16 @@ 
 #ifdef CONFIG_X86_64
 # define SECCOMP_ARCH_DEFAULT			AUDIT_ARCH_X86_64
 # define SECCOMP_ARCH_DEFAULT_NR		NR_syscalls
+# define SECCOMP_ARCH_DEFAULT_NAME		"x86_64"
 # ifdef CONFIG_COMPAT
 #  define SECCOMP_ARCH_COMPAT			AUDIT_ARCH_I386
 #  define SECCOMP_ARCH_COMPAT_NR		IA32_NR_syscalls
+#  define SECCOMP_ARCH_COMPAT_NAME		"x86_32"
 # endif
 #else /* !CONFIG_X86_64 */
 # define SECCOMP_ARCH_DEFAULT		AUDIT_ARCH_I386
 # define SECCOMP_ARCH_DEFAULT_NR	NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME		"x86_32"
 #endif
 
 #include <asm-generic/seccomp.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 617db4e0faa0..c60c5fce70fa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3258,6 +3258,9 @@  static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_DEBUG_SECCOMP_CACHE
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 02aef2844c38..c35430f5f553 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -121,4 +121,9 @@  static inline long seccomp_get_metadata(struct task_struct *task,
 	return -EINVAL;
 }
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+
+#ifdef CONFIG_DEBUG_SECCOMP_CACHE
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task);
+#endif
 #endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bed3b2a7f6c8..c5ca5e30281b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2297,3 +2297,49 @@  static int __init seccomp_sysctl_init(void)
 device_initcall(seccomp_sysctl_init)
 
 #endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_DEBUG_SECCOMP_CACHE
+/* Currently CONFIG_DEBUG_SECCOMP_CACHE implies CONFIG_SECCOMP_CACHE_NR_ONLY */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+					const void *bitmap, size_t bitmap_size)
+{
+	int nr;
+
+	for (nr = 0; nr < bitmap_size; nr++) {
+		bool cached = test_bit(nr, bitmap);
+		char *status = cached ? "ALLOW" : "FILTER";
+
+		seq_printf(m, "%s %d %s\n", name, nr, status);
+	}
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_filter *f;
+
+	/*
+	 * We don't want some sandboxed process know what their seccomp
+	 * filters consist of.
+	 */
+	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+		return -EACCES;
+
+	f = READ_ONCE(task->seccomp.filter);
+	if (!f)
+		return 0;
+
+#ifdef SECCOMP_ARCH_DEFAULT
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_DEFAULT_NAME,
+				    f->cache.syscall_allow_default,
+				    SECCOMP_ARCH_DEFAULT_NR);
+#endif /* SECCOMP_ARCH_DEFAULT */
+
+#ifdef SECCOMP_ARCH_COMPAT
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+				    f->cache.syscall_allow_compat,
+				    SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+	return 0;
+}
+#endif /* CONFIG_DEBUG_SECCOMP_CACHE */