diff mbox series

[v3,seccomp,3/5] seccomp/cache: Lookup syscall allowlist for fast path

Message ID 83c72471f9f79fa982508bd4db472686a67b8320.1601478774.git.yifeifz2@illinois.edu (mailing list archive)
State Not Applicable
Headers show
Series seccomp: Add bitmap cache of constant allow filter results | expand

Commit Message

YiFei Zhu Sept. 30, 2020, 3:19 p.m. UTC
From: YiFei Zhu <yifeifz2@illinois.edu>

The fast (common) path for seccomp should be that the filter permits
the syscall to pass through, and failing seccomp is expected to be
an exceptional case; it is not expected for userspace to call a
denylisted syscall over and over.

This first finds the current allow bitmask by iterating through
syscall_arches[] array and comparing it to the one in struct
seccomp_data; this loop is expected to be unrolled. It then
does a test_bit against the bitmask. If the bit is set, then
there is no need to run the full filter; it returns
SECCOMP_RET_ALLOW immediately.

Co-developed-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
Signed-off-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
---
 kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

Comments

Kees Cook Sept. 30, 2020, 9:32 p.m. UTC | #1
On Wed, Sep 30, 2020 at 10:19:14AM -0500, YiFei Zhu wrote:
> From: YiFei Zhu <yifeifz2@illinois.edu>
> 
> The fast (common) path for seccomp should be that the filter permits
> the syscall to pass through, and failing seccomp is expected to be
> an exceptional case; it is not expected for userspace to call a
> denylisted syscall over and over.
> 
> This first finds the current allow bitmask by iterating through
> syscall_arches[] array and comparing it to the one in struct
> seccomp_data; this loop is expected to be unrolled. It then
> does a test_bit against the bitmask. If the bit is set, then
> there is no need to run the full filter; it returns
> SECCOMP_RET_ALLOW immediately.
> 
> Co-developed-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
> Signed-off-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>

I'd like the content/ordering of this and the emulator patch to be reorganized a bit.
I'd like to see the infrastructure of the cache added first (along with
the "always allow" test logic in this patch), with the emulator missing:
i.e. the patch is a logical no-op: no behavior changes because nothing
ever changes the cache bits, but all the operational logic, structure
changes, etc, is in place. Then the next patch would be replacing the
no-op with the emulator.

> ---
>  kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index f09c9e74ae05..bed3b2a7f6c8 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
>  static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
>  {
>  }
> +
> +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,

bikeshedding: "cache check" doesn't tell me anything about what it's
actually checking for. How about calling this seccomp_is_constant_allow() or
something that reflects both the "bool" return ("is") and what that bool
means ("should always be allowed").

> +				       const struct seccomp_data *sd)
> +{
> +	return false;
> +}
>  #endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
>  
>  /**
> @@ -331,6 +337,49 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
> +static bool seccomp_cache_check_bitmap(const void *bitmap, size_t bitmap_size,

Please also mark as "inline".

> +				       int syscall_nr)
> +{
> +	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
> +		return false;
> +	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
> +
> +	return test_bit(syscall_nr, bitmap);
> +}
> +
> +/**
> + * seccomp_cache_check - lookup seccomp cache
> + * @sfilter: The seccomp filter
> + * @sd: The seccomp data to lookup the cache with
> + *
> + * Returns true if the seccomp_data is cached and allowed.
> + */
> +static bool seccomp_cache_check(const struct seccomp_filter *sfilter,

inline too.

> +				const struct seccomp_data *sd)
> +{
> +	int syscall_nr = sd->nr;
> +	const struct seccomp_cache_filter_data *cache = &sfilter->cache;
> +
> +#ifdef SECCOMP_ARCH_DEFAULT
> +	if (likely(sd->arch == SECCOMP_ARCH_DEFAULT))
> +		return seccomp_cache_check_bitmap(cache->syscall_allow_default,
> +						  SECCOMP_ARCH_DEFAULT_NR,
> +						  syscall_nr);
> +#endif /* SECCOMP_ARCH_DEFAULT */
> +
> +#ifdef SECCOMP_ARCH_COMPAT
> +	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
> +		return seccomp_cache_check_bitmap(cache->syscall_allow_compat,
> +						  SECCOMP_ARCH_COMPAT_NR,
> +						  syscall_nr);
> +#endif /* SECCOMP_ARCH_COMPAT */
> +
> +	WARN_ON_ONCE(true);
> +	return false;
> +}
> +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
> +
>  /**
>   * seccomp_run_filters - evaluates all seccomp filters against @sd
>   * @sd: optional seccomp data to be passed to filters
> @@ -353,6 +402,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
>  	if (WARN_ON(f == NULL))
>  		return SECCOMP_RET_KILL_PROCESS;
>  
> +	if (seccomp_cache_check(f, sd))
> +		return SECCOMP_RET_ALLOW;
> +
>  	/*
>  	 * All filters in the list are evaluated and the lowest BPF return
>  	 * value always takes priority (ignoring the DATA).
> -- 
> 2.28.0
> 

Otherwise, yup, looks good.
YiFei Zhu Oct. 9, 2020, 12:17 a.m. UTC | #2
On Wed, Sep 30, 2020 at 4:32 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Wed, Sep 30, 2020 at 10:19:14AM -0500, YiFei Zhu wrote:
> > From: YiFei Zhu <yifeifz2@illinois.edu>
> >
> > The fast (common) path for seccomp should be that the filter permits
> > the syscall to pass through, and failing seccomp is expected to be
> > an exceptional case; it is not expected for userspace to call a
> > denylisted syscall over and over.
> >
> > This first finds the current allow bitmask by iterating through
> > syscall_arches[] array and comparing it to the one in struct
> > seccomp_data; this loop is expected to be unrolled. It then
> > does a test_bit against the bitmask. If the bit is set, then
> > there is no need to run the full filter; it returns
> > SECCOMP_RET_ALLOW immediately.
> >
> > Co-developed-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
> > Signed-off-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
> > Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
>
> I'd like the content/ordering of this and the emulator patch to be reorganized a bit.
> I'd like to see the infrastructure of the cache added first (along with
> the "always allow" test logic in this patch), with the emulator missing:
> i.e. the patch is a logical no-op: no behavior changes because nothing
> ever changes the cache bits, but all the operational logic, structure
> changes, etc, is in place. Then the next patch would be replacing the
> no-op with the emulator.
>
> > ---
> >  kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 52 insertions(+)
> >
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index f09c9e74ae05..bed3b2a7f6c8 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
> >  static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
> >  {
> >  }
> > +
> > +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
>
> bikeshedding: "cache check" doesn't tell me anything about what it's
> actually checking for. How about calling this seccomp_is_constant_allow() or
> something that reflects both the "bool" return ("is") and what that bool
> means ("should always be allowed").

We have a naming conflict here. I'm about to rename
seccomp_emu_is_const_allow to seccomp_is_const_allow. Adding another
seccomp_is_constant_allow is confusing. Suggestions?

I think I would prefer to change seccomp_cache_check to
seccomp_cache_check_allow. While in this patch set seccomp_cache_check
does imply the filter is "constant" allow, argument-processing cache
may change this, and specifying an "allow" in the name specifies the
'what that bool means ("should always be allowed")'.

YiFei Zhu
Kees Cook Oct. 9, 2020, 5:35 a.m. UTC | #3
On Thu, Oct 08, 2020 at 07:17:39PM -0500, YiFei Zhu wrote:
> On Wed, Sep 30, 2020 at 4:32 PM Kees Cook <keescook@chromium.org> wrote:
> >
> > On Wed, Sep 30, 2020 at 10:19:14AM -0500, YiFei Zhu wrote:
> > > From: YiFei Zhu <yifeifz2@illinois.edu>
> > >
> > > The fast (common) path for seccomp should be that the filter permits
> > > the syscall to pass through, and failing seccomp is expected to be
> > > an exceptional case; it is not expected for userspace to call a
> > > denylisted syscall over and over.
> > >
> > > This first finds the current allow bitmask by iterating through
> > > syscall_arches[] array and comparing it to the one in struct
> > > seccomp_data; this loop is expected to be unrolled. It then
> > > does a test_bit against the bitmask. If the bit is set, then
> > > there is no need to run the full filter; it returns
> > > SECCOMP_RET_ALLOW immediately.
> > >
> > > Co-developed-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
> > > Signed-off-by: Dimitrios Skarlatos <dskarlat@cs.cmu.edu>
> > > Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
> >
> > I'd like the content/ordering of this and the emulator patch to be reorganized a bit.
> > I'd like to see the infrastructure of the cache added first (along with
> > the "always allow" test logic in this patch), with the emulator missing:
> > i.e. the patch is a logical no-op: no behavior changes because nothing
> > ever changes the cache bits, but all the operational logic, structure
> > changes, etc, is in place. Then the next patch would be replacing the
> > no-op with the emulator.
> >
> > > ---
> > >  kernel/seccomp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 52 insertions(+)
> > >
> > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > index f09c9e74ae05..bed3b2a7f6c8 100644
> > > --- a/kernel/seccomp.c
> > > +++ b/kernel/seccomp.c
> > > @@ -172,6 +172,12 @@ struct seccomp_cache_filter_data { };
> > >  static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
> > >  {
> > >  }
> > > +
> > > +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
> >
> > bikeshedding: "cache check" doesn't tell me anything about what it's
> > actually checking for. How about calling this seccomp_is_constant_allow() or
> > something that reflects both the "bool" return ("is") and what that bool
> > means ("should always be allowed").
> 
> We have a naming conflict here. I'm about to rename
> seccomp_emu_is_const_allow to seccomp_is_const_allow. Adding another
> seccomp_is_constant_allow is confusing. Suggestions?
> 
> I think I would prefer to change seccomp_cache_check to
> seccomp_cache_check_allow. While in this patch set seccomp_cache_check
> does imply the filter is "constant" allow, argument-processing cache
> may change this, and specifying an "allow" in the name specifies the
> 'what that bool means ("should always be allowed")'.

Yeah, that seems good.
diff mbox series

Patch

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f09c9e74ae05..bed3b2a7f6c8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -172,6 +172,12 @@  struct seccomp_cache_filter_data { };
 static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
 {
 }
+
+static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter,
+				       const struct seccomp_data *sd)
+{
+	return false;
+}
 #endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
 
 /**
@@ -331,6 +337,49 @@  static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 	return 0;
 }
 
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+static bool seccomp_cache_check_bitmap(const void *bitmap, size_t bitmap_size,
+				       int syscall_nr)
+{
+	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+		return false;
+	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+	return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static bool seccomp_cache_check(const struct seccomp_filter *sfilter,
+				const struct seccomp_data *sd)
+{
+	int syscall_nr = sd->nr;
+	const struct seccomp_cache_filter_data *cache = &sfilter->cache;
+
+#ifdef SECCOMP_ARCH_DEFAULT
+	if (likely(sd->arch == SECCOMP_ARCH_DEFAULT))
+		return seccomp_cache_check_bitmap(cache->syscall_allow_default,
+						  SECCOMP_ARCH_DEFAULT_NR,
+						  syscall_nr);
+#endif /* SECCOMP_ARCH_DEFAULT */
+
+#ifdef SECCOMP_ARCH_COMPAT
+	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+		return seccomp_cache_check_bitmap(cache->syscall_allow_compat,
+						  SECCOMP_ARCH_COMPAT_NR,
+						  syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+	WARN_ON_ONCE(true);
+	return false;
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
 /**
  * seccomp_run_filters - evaluates all seccomp filters against @sd
  * @sd: optional seccomp data to be passed to filters
@@ -353,6 +402,9 @@  static u32 seccomp_run_filters(const struct seccomp_data *sd,
 	if (WARN_ON(f == NULL))
 		return SECCOMP_RET_KILL_PROCESS;
 
+	if (seccomp_cache_check(f, sd))
+		return SECCOMP_RET_ALLOW;
+
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).