diff mbox series

[v2,2/4] capabilities: Add securebit to restrict userns caps

Message ID 20240609104355.442002-3-jcalmels@3xx0.net (mailing list archive)
State Not Applicable
Headers show
Series Introduce user namespace capabilities | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / test
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / veristat

Commit Message

Jonathan Calmels June 9, 2024, 10:43 a.m. UTC
This patch adds a new capability security bit designed to constrain a
task’s userns capability set to its bounding set. The reason for this is
twofold:

- This serves as a quick and easy way to lock down a set of capabilities
  for a task, thus ensuring that any namespace it creates will never be
  more privileged than itself is.
- This helps userspace transition to more secure defaults by not requiring
  specific logic for the userns capability set, or libcap support.

Example:

    # capsh --secbits=$((1 << 8)) --drop=cap_sys_rawio -- \
            -c 'unshare -r grep Cap /proc/self/status'
    CapInh: 0000000000000000
    CapPrm: 000001fffffdffff
    CapEff: 000001fffffdffff
    CapBnd: 000001fffffdffff
    CapAmb: 0000000000000000
    CapUNs: 000001fffffdffff

Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>
---
 include/linux/securebits.h      |  1 +
 include/uapi/linux/securebits.h | 11 ++++++++++-
 kernel/user_namespace.c         |  5 +++++
 3 files changed, 16 insertions(+), 1 deletion(-)

Comments

Serge E. Hallyn June 10, 2024, 2:33 a.m. UTC | #1
On Sun, Jun 09, 2024 at 03:43:35AM -0700, Jonathan Calmels wrote:
> This patch adds a new capability security bit designed to constrain a
> task’s userns capability set to its bounding set. The reason for this is
> twofold:
> 
> - This serves as a quick and easy way to lock down a set of capabilities
>   for a task, thus ensuring that any namespace it creates will never be
>   more privileged than itself is.
> - This helps userspace transition to more secure defaults by not requiring
>   specific logic for the userns capability set, or libcap support.
> 
> Example:
> 
>     # capsh --secbits=$((1 << 8)) --drop=cap_sys_rawio -- \
>             -c 'unshare -r grep Cap /proc/self/status'
>     CapInh: 0000000000000000
>     CapPrm: 000001fffffdffff
>     CapEff: 000001fffffdffff
>     CapBnd: 000001fffffdffff
>     CapAmb: 0000000000000000
>     CapUNs: 000001fffffdffff

But you are not (that I can see, in this or the previous patch)
keeping SECURE_USERNS_STRICT_CAPS in securebits on the next
level unshare.  Though I think it's ok, because by then both
cap_userns and cap_bset are reduced and cap_userns can't be
expanded.  (Sorry, just thinking aloud here)

> Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>
> ---
>  include/linux/securebits.h      |  1 +
>  include/uapi/linux/securebits.h | 11 ++++++++++-
>  kernel/user_namespace.c         |  5 +++++
>  3 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/securebits.h b/include/linux/securebits.h
> index 656528673983..5f9d85cd69c3 100644
> --- a/include/linux/securebits.h
> +++ b/include/linux/securebits.h
> @@ -5,4 +5,5 @@
>  #include <uapi/linux/securebits.h>
>  
>  #define issecure(X)		(issecure_mask(X) & current_cred_xxx(securebits))
> +#define iscredsecure(cred, X)	(issecure_mask(X) & cred->securebits)
>  #endif /* !_LINUX_SECUREBITS_H */
> diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
> index d6d98877ff1a..2da3f4be4531 100644
> --- a/include/uapi/linux/securebits.h
> +++ b/include/uapi/linux/securebits.h
> @@ -52,10 +52,19 @@
>  #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
>  			(issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
>  
> +/* When set, user namespace capabilities are restricted to their parent's bounding set. */
> +#define SECURE_USERNS_STRICT_CAPS			8
> +#define SECURE_USERNS_STRICT_CAPS_LOCKED		9  /* make bit-8 immutable */
> +
> +#define SECBIT_USERNS_STRICT_CAPS (issecure_mask(SECURE_USERNS_STRICT_CAPS))
> +#define SECBIT_USERNS_STRICT_CAPS_LOCKED \
> +			(issecure_mask(SECURE_USERNS_STRICT_CAPS_LOCKED))
> +
>  #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
>  				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
>  				 issecure_mask(SECURE_KEEP_CAPS) | \
> -				 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
> +				 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE) | \
> +				 issecure_mask(SECURE_USERNS_STRICT_CAPS))
>  #define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
>  
>  #endif /* _UAPI_LINUX_SECUREBITS_H */
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 7e624607330b..53848e2b68cd 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -10,6 +10,7 @@
>  #include <linux/cred.h>
>  #include <linux/securebits.h>
>  #include <linux/security.h>
> +#include <linux/capability.h>
>  #include <linux/keyctl.h>
>  #include <linux/key-type.h>
>  #include <keys/user-type.h>
> @@ -42,6 +43,10 @@ static void dec_user_namespaces(struct ucounts *ucounts)
>  
>  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
>  {
> +	/* Limit userns capabilities to our parent's bounding set. */

In the case of userns_install(), it will be the target user namespace
creator's bounding set, right?  Not "our parent's"?

> +	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
> +		cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
> +
>  	/* Start with the capabilities defined in the userns set. */
>  	cred->cap_bset = cred->cap_userns;
>  	cred->cap_permitted = cred->cap_userns;
> -- 
> 2.45.2
Jonathan Calmels June 10, 2024, 9:46 a.m. UTC | #2
On Sun, Jun 09, 2024 at 09:33:01PM GMT, Serge E. Hallyn wrote:
> On Sun, Jun 09, 2024 at 03:43:35AM -0700, Jonathan Calmels wrote:
> > This patch adds a new capability security bit designed to constrain a
> > task’s userns capability set to its bounding set. The reason for this is
> > twofold:
> > 
> > - This serves as a quick and easy way to lock down a set of capabilities
> >   for a task, thus ensuring that any namespace it creates will never be
> >   more privileged than itself is.
> > - This helps userspace transition to more secure defaults by not requiring
> >   specific logic for the userns capability set, or libcap support.
> > 
> > Example:
> > 
> >     # capsh --secbits=$((1 << 8)) --drop=cap_sys_rawio -- \
> >             -c 'unshare -r grep Cap /proc/self/status'
> >     CapInh: 0000000000000000
> >     CapPrm: 000001fffffdffff
> >     CapEff: 000001fffffdffff
> >     CapBnd: 000001fffffdffff
> >     CapAmb: 0000000000000000
> >     CapUNs: 000001fffffdffff
> 
> But you are not (that I can see, in this or the previous patch)
> keeping SECURE_USERNS_STRICT_CAPS in securebits on the next
> level unshare.  Though I think it's ok, because by then both
> cap_userns and cap_bset are reduced and cap_userns can't be
> expanded.  (Sorry, just thinking aloud here)

Right this is safe to reset, but maybe we do keep it if the secbit is
locked? This is kind of a special case compared to the other bits.

> > +	/* Limit userns capabilities to our parent's bounding set. */
> 
> In the case of userns_install(), it will be the target user namespace
> creator's bounding set, right?  Not "our parent's"?

Good point, I should reword this comment.
Serge E. Hallyn June 10, 2024, 1:05 p.m. UTC | #3
On Mon, Jun 10, 2024 at 02:46:06AM -0700, Jonathan Calmels wrote:
> On Sun, Jun 09, 2024 at 09:33:01PM GMT, Serge E. Hallyn wrote:
> > On Sun, Jun 09, 2024 at 03:43:35AM -0700, Jonathan Calmels wrote:
> > > This patch adds a new capability security bit designed to constrain a
> > > task’s userns capability set to its bounding set. The reason for this is
> > > twofold:
> > > 
> > > - This serves as a quick and easy way to lock down a set of capabilities
> > >   for a task, thus ensuring that any namespace it creates will never be
> > >   more privileged than itself is.
> > > - This helps userspace transition to more secure defaults by not requiring
> > >   specific logic for the userns capability set, or libcap support.
> > > 
> > > Example:
> > > 
> > >     # capsh --secbits=$((1 << 8)) --drop=cap_sys_rawio -- \
> > >             -c 'unshare -r grep Cap /proc/self/status'
> > >     CapInh: 0000000000000000
> > >     CapPrm: 000001fffffdffff
> > >     CapEff: 000001fffffdffff
> > >     CapBnd: 000001fffffdffff
> > >     CapAmb: 0000000000000000
> > >     CapUNs: 000001fffffdffff
> > 
> > But you are not (that I can see, in this or the previous patch)
> > keeping SECURE_USERNS_STRICT_CAPS in securebits on the next
> > level unshare.  Though I think it's ok, because by then both
> > cap_userns and cap_bset are reduced and cap_userns can't be
> > expanded.  (Sorry, just thinking aloud here)
> 
> Right this is safe to reset, but maybe we do keep it if the secbit is
> locked? This is kind of a special case compared to the other bits.

I don't think it would be worth the extra complication in the
secbits code, and it's semantically very different from the
cap_userns.

> > > +	/* Limit userns capabilities to our parent's bounding set. */
> > 
> > In the case of userns_install(), it will be the target user namespace
> > creator's bounding set, right?  Not "our parent's"?
> 
> Good point, I should reword this comment.
Jarkko Sakkinen June 28, 2024, 2:43 p.m. UTC | #4
On Sun, 2024-06-09 at 03:43 -0700, Jonathan Calmels wrote:
> This patch adds a new capability security bit designed to constrain a


nit: if you think of it "This patch adds" could be just "add", right? :-)

Also name the exact thing/symbol/whatever here. This is not a HBO series.

> task’s userns capability set to its bounding set. The reason for this
> is
> twofold:
> 
> - This serves as a quick and easy way to lock down a set of
> capabilities
>   for a task, thus ensuring that any namespace it creates will never
> be
>   more privileged than itself is.
> - This helps userspace transition to more secure defaults by not
> requiring
>   specific logic for the userns capability set, or libcap support.
> 
> Example:
> 
>     # capsh --secbits=$((1 << 8)) --drop=cap_sys_rawio -- \
>             -c 'unshare -r grep Cap /proc/self/status'
>     CapInh: 0000000000000000
>     CapPrm: 000001fffffdffff
>     CapEff: 000001fffffdffff
>     CapBnd: 000001fffffdffff
>     CapAmb: 0000000000000000
>     CapUNs: 000001fffffdffff
> 
> Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>
> ---
>  include/linux/securebits.h      |  1 +
>  include/uapi/linux/securebits.h | 11 ++++++++++-
>  kernel/user_namespace.c         |  5 +++++
>  3 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/securebits.h b/include/linux/securebits.h
> index 656528673983..5f9d85cd69c3 100644
> --- a/include/linux/securebits.h
> +++ b/include/linux/securebits.h
> @@ -5,4 +5,5 @@
>  #include <uapi/linux/securebits.h>
>  
>  #define issecure(X)		(issecure_mask(X) &
> current_cred_xxx(securebits))
> +#define iscredsecure(cred, X)	(issecure_mask(X) & cred-
> >securebits)
>  #endif /* !_LINUX_SECUREBITS_H */
> diff --git a/include/uapi/linux/securebits.h
> b/include/uapi/linux/securebits.h
> index d6d98877ff1a..2da3f4be4531 100644
> --- a/include/uapi/linux/securebits.h
> +++ b/include/uapi/linux/securebits.h
> @@ -52,10 +52,19 @@
>  #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
>  			(issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_L
> OCKED))
>  
> +/* When set, user namespace capabilities are restricted to their
> parent's bounding set. */
> +#define SECURE_USERNS_STRICT_CAPS			8
> +#define SECURE_USERNS_STRICT_CAPS_LOCKED		9  /* make



> bit-8 immutable */
> +
> +#define SECBIT_USERNS_STRICT_CAPS
> (issecure_mask(SECURE_USERNS_STRICT_CAPS))
> +#define SECBIT_USERNS_STRICT_CAPS_LOCKED \
> +			(issecure_mask(SECURE_USERNS_STRICT_CAPS_LOC
> KED))
> +
>  #define
> SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
>  				
> issecure_mask(SECURE_NO_SETUID_FIXUP) | \
>  				 issecure_mask(SECURE_KEEP_CAPS) | \
> -				
> issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
> +				
> issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE) | \
> +				

spurious new lines in the diff

please as first priority aim for absolute minimal diff or at least
do grow diff proactively like this.

If we really think after that, that we need some "extras" to the
patch set, then we decide that. These only take energy away from
reviewers.


> issecure_mask(SECURE_USERNS_STRICT_CAPS))
>  #define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
>  
>  #endif /* _UAPI_LINUX_SECUREBITS_H */
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 7e624607330b..53848e2b68cd 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -10,6 +10,7 @@
>  #include <linux/cred.h>
>  #include <linux/securebits.h>
>  #include <linux/security.h>
> +#include <linux/capability.h>
>  #include <linux/keyctl.h>
>  #include <linux/key-type.h>
>  #include <keys/user-type.h>
> @@ -42,6 +43,10 @@ static void dec_user_namespaces(struct ucounts
> *ucounts)
>  
>  static void set_cred_user_ns(struct cred *cred, struct
> user_namespace *user_ns)
>  {
> +	/* Limit userns capabilities to our parent's bounding set.
> */
> +	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
> +		cred->cap_userns = cap_intersect(cred->cap_userns,
> cred->cap_bset);
> +
>  	/* Start with the capabilities defined in the userns set. */
>  	cred->cap_bset = cred->cap_userns;
>  	cred->cap_permitted = cred->cap_userns;

Going for 4 week holiday starting for next week so focus in on nits
but since this is something to do access control:

1. Please go surgical with the diff's because this type of patches
also require a surgical review. Now reviewing this like riding on 
a bumpy road with a car of which suspension mechanics is broken
;-)

Hope you grab my argument here. I only want to look at the problem
and solution for that not random stuff..

BR, Jarkko
Jarkko Sakkinen June 28, 2024, 2:45 p.m. UTC | #5
On Fri, 2024-06-28 at 17:43 +0300, Jarkko Sakkinen wrote:
> On Sun, 2024-06-09 at 03:43 -0700, Jonathan Calmels wrote:
> > This patch adds a new capability security bit designed to constrain
> > a
> 
> 
> nit: if you think of it "This patch adds" could be just "add", right?
> :-)
> 
> Also name the exact thing/symbol/whatever here. This is not a HBO
> series.
> 
> > task’s userns capability set to its bounding set. The reason for
> > this
> > is
> > twofold:
> > 
> > - This serves as a quick and easy way to lock down a set of
> > capabilities
> >   for a task, thus ensuring that any namespace it creates will
> > never
> > be
> >   more privileged than itself is.
> > - This helps userspace transition to more secure defaults by not
> > requiring
> >   specific logic for the userns capability set, or libcap support.
> > 
> > Example:
> > 
> >     # capsh --secbits=$((1 << 8)) --drop=cap_sys_rawio -- \
> >             -c 'unshare -r grep Cap /proc/self/status'
> >     CapInh: 0000000000000000
> >     CapPrm: 000001fffffdffff
> >     CapEff: 000001fffffdffff
> >     CapBnd: 000001fffffdffff
> >     CapAmb: 0000000000000000
> >     CapUNs: 000001fffffdffff
> > 
> > Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>
> > ---
> >  include/linux/securebits.h      |  1 +
> >  include/uapi/linux/securebits.h | 11 ++++++++++-
> >  kernel/user_namespace.c         |  5 +++++
> >  3 files changed, 16 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/securebits.h
> > b/include/linux/securebits.h
> > index 656528673983..5f9d85cd69c3 100644
> > --- a/include/linux/securebits.h
> > +++ b/include/linux/securebits.h
> > @@ -5,4 +5,5 @@
> >  #include <uapi/linux/securebits.h>
> >  
> >  #define issecure(X)		(issecure_mask(X) &
> > current_cred_xxx(securebits))
> > +#define iscredsecure(cred, X)	(issecure_mask(X) & cred-
> > > securebits)
> >  #endif /* !_LINUX_SECUREBITS_H */
> > diff --git a/include/uapi/linux/securebits.h
> > b/include/uapi/linux/securebits.h
> > index d6d98877ff1a..2da3f4be4531 100644
> > --- a/include/uapi/linux/securebits.h
> > +++ b/include/uapi/linux/securebits.h
> > @@ -52,10 +52,19 @@
> >  #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
> >  			(issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE
> > _L
> > OCKED))
> >  
> > +/* When set, user namespace capabilities are restricted to their
> > parent's bounding set. */
> > +#define SECURE_USERNS_STRICT_CAPS			8
> > +#define SECURE_USERNS_STRICT_CAPS_LOCKED		9  /* make
> 
> 
> 
> > bit-8 immutable */
> > +
> > +#define SECBIT_USERNS_STRICT_CAPS
> > (issecure_mask(SECURE_USERNS_STRICT_CAPS))
> > +#define SECBIT_USERNS_STRICT_CAPS_LOCKED \
> > +			(issecure_mask(SECURE_USERNS_STRICT_CAPS_L
> > OC
> > KED))
> > +
> >  #define
> > SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
> >  				
> > issecure_mask(SECURE_NO_SETUID_FIXUP) | \
> >  				 issecure_mask(SECURE_KEEP_CAPS) |
> > \
> > -				
> > issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
> > +				
> > issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE) | \
> > +				
> 
> spurious new lines in the diff
> 
> please as first priority aim for absolute minimal diff or at least
> do grow diff proactively like this.
> 
> If we really think after that, that we need some "extras" to the
> patch set, then we decide that. These only take energy away from
> reviewers.
> 
> 
> > issecure_mask(SECURE_USERNS_STRICT_CAPS))
> >  #define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
> >  
> >  #endif /* _UAPI_LINUX_SECUREBITS_H */
> > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> > index 7e624607330b..53848e2b68cd 100644
> > --- a/kernel/user_namespace.c
> > +++ b/kernel/user_namespace.c
> > @@ -10,6 +10,7 @@
> >  #include <linux/cred.h>
> >  #include <linux/securebits.h>
> >  #include <linux/security.h>
> > +#include <linux/capability.h>
> >  #include <linux/keyctl.h>
> >  #include <linux/key-type.h>
> >  #include <keys/user-type.h>
> > @@ -42,6 +43,10 @@ static void dec_user_namespaces(struct ucounts
> > *ucounts)
> >  
> >  static void set_cred_user_ns(struct cred *cred, struct
> > user_namespace *user_ns)
> >  {
> > +	/* Limit userns capabilities to our parent's bounding set.
> > */
> > +	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
> > +		cred->cap_userns = cap_intersect(cred->cap_userns,
> > cred->cap_bset);
> > +
> >  	/* Start with the capabilities defined in the userns set.
> > */
> >  	cred->cap_bset = cred->cap_userns;
> >  	cred->cap_permitted = cred->cap_userns;
> 
> Going for 4 week holiday starting for next week so focus in on nits
> but since this is something to do access control:
> 
> 1. Please go surgical with the diff's because this type of patches
> also require a surgical review. Now reviewing this like riding on 
> a bumpy road with a car of which suspension mechanics is broken
> ;-)
> 
> Hope you grab my argument here. I only want to look at the problem
> and solution for that not random stuff..

I skip the other patches because of my eager to get on holiday but
my instinct tells me that at least some of this feedback applies
to all of the patches.

So put your solution in sight, not clean ups.


BR, Jarkko
diff mbox series

Patch

diff --git a/include/linux/securebits.h b/include/linux/securebits.h
index 656528673983..5f9d85cd69c3 100644
--- a/include/linux/securebits.h
+++ b/include/linux/securebits.h
@@ -5,4 +5,5 @@ 
 #include <uapi/linux/securebits.h>
 
 #define issecure(X)		(issecure_mask(X) & current_cred_xxx(securebits))
+#define iscredsecure(cred, X)	(issecure_mask(X) & cred->securebits)
 #endif /* !_LINUX_SECUREBITS_H */
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index d6d98877ff1a..2da3f4be4531 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -52,10 +52,19 @@ 
 #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
 			(issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
 
+/* When set, user namespace capabilities are restricted to their parent's bounding set. */
+#define SECURE_USERNS_STRICT_CAPS			8
+#define SECURE_USERNS_STRICT_CAPS_LOCKED		9  /* make bit-8 immutable */
+
+#define SECBIT_USERNS_STRICT_CAPS (issecure_mask(SECURE_USERNS_STRICT_CAPS))
+#define SECBIT_USERNS_STRICT_CAPS_LOCKED \
+			(issecure_mask(SECURE_USERNS_STRICT_CAPS_LOCKED))
+
 #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
 				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
 				 issecure_mask(SECURE_KEEP_CAPS) | \
-				 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
+				 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE) | \
+				 issecure_mask(SECURE_USERNS_STRICT_CAPS))
 #define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
 
 #endif /* _UAPI_LINUX_SECUREBITS_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7e624607330b..53848e2b68cd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -10,6 +10,7 @@ 
 #include <linux/cred.h>
 #include <linux/securebits.h>
 #include <linux/security.h>
+#include <linux/capability.h>
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
 #include <keys/user-type.h>
@@ -42,6 +43,10 @@  static void dec_user_namespaces(struct ucounts *ucounts)
 
 static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 {
+	/* Limit userns capabilities to our parent's bounding set. */
+	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
+		cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
+
 	/* Start with the capabilities defined in the userns set. */
 	cred->cap_bset = cred->cap_userns;
 	cred->cap_permitted = cred->cap_userns;