diff mbox series

[RFC,v1,4/4] Allow to change the user namespace in which user rlimits are counted

Message ID 2718f7b13189dfd159414efb68e3533552593140.1604335819.git.gladkov.alexey@gmail.com (mailing list archive)
State New, archived
Headers show
Series Per user namespace rlimits | expand

Commit Message

Alexey Gladkov Nov. 2, 2020, 4:50 p.m. UTC
Add a new prctl to change the user namespace in which the process
counter is located. A pointer to the user namespace is in cred struct
to be inherited by all child processes.

Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
---
 fs/exec.c                  |  2 +-
 fs/io-wq.c                 | 13 ++++++++-----
 fs/io-wq.h                 |  1 +
 fs/io_uring.c              |  1 +
 include/linux/cred.h       |  8 ++++++++
 include/uapi/linux/prctl.h |  5 +++++
 kernel/cred.c              | 35 +++++++++++++++++++++++++++++------
 kernel/exit.c              |  2 +-
 kernel/fork.c              |  4 ++--
 kernel/sys.c               | 22 +++++++++++++++++++++-
 kernel/user_namespace.c    |  3 +++
 11 files changed, 80 insertions(+), 16 deletions(-)

Comments

Jann Horn Nov. 2, 2020, 5:10 p.m. UTC | #1
On Mon, Nov 2, 2020 at 5:52 PM Alexey Gladkov <gladkov.alexey@gmail.com> wrote:
> Add a new prctl to change the user namespace in which the process
> counter is located. A pointer to the user namespace is in cred struct
> to be inherited by all child processes.
[...]
> +       case PR_SET_RLIMIT_USER_NAMESPACE:
> +               if (!capable(CAP_SYS_RESOURCE))
> +                       return -EPERM;
> +
> +               switch (arg2) {
> +               case PR_RLIMIT_BIND_GLOBAL_USERNS:
> +                       error = set_rlimit_ns(&init_user_ns);
> +                       break;
> +               case PR_RLIMIT_BIND_CURRENT_USERNS:
> +                       error = set_rlimit_ns(current_user_ns());
> +                       break;
> +               default:
> +                       error = -EINVAL;
> +               }
> +               break;

I don't see how this can work. capable() requires that
current_user_ns()==&init_user_ns, so you can't use this API to bind
rlimits to any other user namespace.

Fundamentally, if it requires CAP_SYS_RESOURCE, this probably can't be
done as an API that a process uses to change its own rlimit scope. In
that case I would implement this as part of clone3() instead of
prctl(). (Then init_user_ns can set it if the caller has
CAP_SYS_RESOURCE. If you want to have support for doing the same thing
with nested namespaces, you'd also need a flag that the first-level
clone3() can set on the namespace to say "further rlimit splitting
should be allowed".)

Or alternatively, we could say that CAP_SYS_RESOURCE doesn't matter,
and instead you're allowed to move the rlimit scope if your current
hard rlimit is INFINITY. That might make more sense? Maybe?
Alexey Gladkov Nov. 2, 2020, 5:30 p.m. UTC | #2
On Mon, Nov 02, 2020 at 06:10:06PM +0100, Jann Horn wrote:
> On Mon, Nov 2, 2020 at 5:52 PM Alexey Gladkov <gladkov.alexey@gmail.com> wrote:
> > Add a new prctl to change the user namespace in which the process
> > counter is located. A pointer to the user namespace is in cred struct
> > to be inherited by all child processes.
> [...]
> > +       case PR_SET_RLIMIT_USER_NAMESPACE:
> > +               if (!capable(CAP_SYS_RESOURCE))
> > +                       return -EPERM;
> > +
> > +               switch (arg2) {
> > +               case PR_RLIMIT_BIND_GLOBAL_USERNS:
> > +                       error = set_rlimit_ns(&init_user_ns);
> > +                       break;
> > +               case PR_RLIMIT_BIND_CURRENT_USERNS:
> > +                       error = set_rlimit_ns(current_user_ns());
> > +                       break;
> > +               default:
> > +                       error = -EINVAL;
> > +               }
> > +               break;
> 
> I don't see how this can work. capable() requires that
> current_user_ns()==&init_user_ns, so you can't use this API to bind
> rlimits to any other user namespace.
> 
> Fundamentally, if it requires CAP_SYS_RESOURCE, this probably can't be
> done as an API that a process uses to change its own rlimit scope. In
> that case I would implement this as part of clone3() instead of
> prctl(). (Then init_user_ns can set it if the caller has
> CAP_SYS_RESOURCE. If you want to have support for doing the same thing
> with nested namespaces, you'd also need a flag that the first-level
> clone3() can set on the namespace to say "further rlimit splitting
> should be allowed".)
> 
> Or alternatively, we could say that CAP_SYS_RESOURCE doesn't matter,
> and instead you're allowed to move the rlimit scope if your current
> hard rlimit is INFINITY. That might make more sense? Maybe?

I think you are right. CAP_SYS_RESOURCE is not needed here since you still
cannot exceed the rlimit in the parent user namespace.
Sargun Dhillon Nov. 4, 2020, 10:03 a.m. UTC | #3
On Mon, Nov 02, 2020 at 05:50:33PM +0100, Alexey Gladkov wrote:
> Add a new prctl to change the user namespace in which the process
> counter is located. A pointer to the user namespace is in cred struct
> to be inherited by all child processes.
> 
> Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> ---
>  fs/exec.c                  |  2 +-
>  fs/io-wq.c                 | 13 ++++++++-----
>  fs/io-wq.h                 |  1 +
>  fs/io_uring.c              |  1 +
>  include/linux/cred.h       |  8 ++++++++
>  include/uapi/linux/prctl.h |  5 +++++
>  kernel/cred.c              | 35 +++++++++++++++++++++++++++++------
>  kernel/exit.c              |  2 +-
>  kernel/fork.c              |  4 ++--
>  kernel/sys.c               | 22 +++++++++++++++++++++-
>  kernel/user_namespace.c    |  3 +++
>  11 files changed, 80 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/exec.c b/fs/exec.c
> index c45dfc716394..574b1381276c 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
>  		goto out_ret;
>  	}
>  
> -	processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
> +	processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
>  
>  	/*
>  	 * We move the actual failure in case of RLIMIT_NPROC excess from
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index c3b0843abc9b..19e43ec115cb 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -116,6 +116,7 @@ struct io_wq {
>  
>  	struct task_struct *manager;
>  	struct user_struct *user;
It seems like user would be unused here, and you could use creds->user?

> +	const struct cred *creds;
>  	refcount_t refs;
>  	struct completion done;
>  
> @@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
>  	if (worker->flags & IO_WORKER_F_RUNNING)
>  		atomic_dec(&acct->nr_running);
>  	if (!(worker->flags & IO_WORKER_F_BOUND))
> -		dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> +		dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
>  	worker->flags = 0;
>  	preempt_enable();
>  
> @@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
>  			worker->flags |= IO_WORKER_F_BOUND;
>  			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
>  			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
> -			dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> +			dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
>  		} else {
> -			if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> +			if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
>  				return;
>  			worker->flags &= ~IO_WORKER_F_BOUND;
>  			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
> @@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
>  	}
>  
>  	if (index == IO_WQ_ACCT_UNBOUND &&
> -	    !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> +	    !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
>  		kfree(worker);
>  		return false;
>  	}
> @@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
>  	if (free_worker)
>  		return true;
>  
> -	processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> +	processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
> +			UCOUNT_RLIMIT_NPROC);
>  
>  	if (processes >= acct->max_workers &&
>  	    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
> @@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
>  
>  	/* caller must already hold a reference to this */
>  	wq->user = data->user;
> +	wq->creds = data->creds;
>  
>  	for_each_node(node) {
>  		struct io_wqe *wqe;
> diff --git a/fs/io-wq.h b/fs/io-wq.h
> index 071f1a997800..6acc3a04c38f 100644
> --- a/fs/io-wq.h
> +++ b/fs/io-wq.h
> @@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);
>  
>  struct io_wq_data {
>  	struct user_struct *user;
> +	const struct cred *creds;
>  
>  	io_wq_work_fn *do_work;
>  	free_work_fn *free_work;
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 493e5047e67c..e419923968b3 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
>  	int ret = 0;
>  
>  	data.user = ctx->user;
> +	data.creds = ctx->creds;
>  	data.free_work = io_free_work;
>  	data.do_work = io_wq_submit_work;
>  
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 18639c069263..43aee68d117f 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -144,6 +144,7 @@ struct cred {
>  #endif
>  	struct user_struct *user;	/* real user ID subscription */
>  	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
> +	struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
>  	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
>  	/* RCU deletion */
>  	union {
> @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
>  extern int set_create_files_as(struct cred *, struct inode *);
>  extern int cred_fscmp(const struct cred *, const struct cred *);
>  extern void __init cred_init(void);
> +extern int set_rlimit_ns(struct user_namespace *ns);
>  
>  /*
>   * check for validity of credentials
> @@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
>  
>  #define task_uid(task)		(task_cred_xxx((task), uid))
>  #define task_euid(task)		(task_cred_xxx((task), euid))
> +#define task_rlimit_ns(task)	(task_cred_xxx((task), rlimit_ns))
>  
>  #define current_cred_xxx(xxx)			\
>  ({						\
> @@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
>  extern struct user_namespace init_user_ns;
>  #ifdef CONFIG_USER_NS
>  #define current_user_ns()	(current_cred_xxx(user_ns))
> +#define current_rlimit_ns()	(current_cred_xxx(rlimit_ns))
>  #else
>  static inline struct user_namespace *current_user_ns(void)
>  {
>  	return &init_user_ns;
>  }
> +static inline struct user_namespace *current_rlimit_ns(void)
> +{
> +	return &init_user_ns;
> +}
>  #endif
>  
>  
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 07b4f8131e36..4f853f903415 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -238,4 +238,9 @@ struct prctl_mm_map {
>  #define PR_SET_IO_FLUSHER		57
>  #define PR_GET_IO_FLUSHER		58
>  
> +#define PR_SET_RLIMIT_USER_NAMESPACE	59
> +#define PR_GET_RLIMIT_USER_NAMESPACE	60
> +# define PR_RLIMIT_BIND_GLOBAL_USERNS	(1UL << 0)
> +# define PR_RLIMIT_BIND_CURRENT_USERNS	(1UL << 1)
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 748704db1f6b..7b90e1ef9c9a 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -59,6 +59,7 @@ struct cred init_cred = {
>  	.cap_bset		= CAP_FULL_SET,
>  	.user			= INIT_USER,
>  	.user_ns		= &init_user_ns,
> +	.rlimit_ns		= &init_user_ns,
>  	.group_info		= &init_groups,
>  };
>  
> @@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
>  		put_group_info(cred->group_info);
>  	free_uid(cred->user);
>  	put_user_ns(cred->user_ns);
> +	put_user_ns(cred->rlimit_ns);
>  	kmem_cache_free(cred_jar, cred);
>  }
>  
> @@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
>  	get_group_info(new->group_info);
>  	get_uid(new->user);
>  	get_user_ns(new->user_ns);
> +	get_user_ns(new->rlimit_ns);
>  
>  #ifdef CONFIG_KEYS
>  	key_get(new->session_keyring);
> @@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
>  #endif
>  		clone_flags & CLONE_THREAD
>  	    ) {
> -		if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
> +		if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
>  			return -EACCES;
>  		p->real_cred = get_cred(p->cred);
>  		get_cred(p->cred);
> @@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
>  	}
>  #endif
>  
> -	if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> +	if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
>  		return -EACCES;
>  	p->cred = p->real_cred = get_cred(new);
>  	alter_cred_subscribers(new, 2);
> @@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
>  	 * perhaps this limit is exceeded in the parent user namespace.
>  	 */
>  	alter_cred_subscribers(new, 2);
> -	if (new->user != old->user &&
> -	    !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> +	if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
> +	    !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
>  		task->flags |= PF_NPROC_UNS_EXCEEDED;
>  	rcu_assign_pointer(task->real_cred, new);
>  	rcu_assign_pointer(task->cred, new);
> -	if (new->user != old->user)
> -		dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> +	if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
> +		dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
>  	alter_cred_subscribers(old, -2);
>  
>  	/* send notifications */
> @@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
>  }
>  EXPORT_SYMBOL(set_create_files_as);
>  
> +/*
> + * Change the rlimit user namespace of the current task, replacing the existing
> + * one. If the given namespace is NULL, then initial user namespace will be
> + * used.
> + *
> + * Return: 0 on success; -errno on failure.
> + */
> +int set_rlimit_ns(struct user_namespace *ns)
> +{
> +	struct cred *new;
> +
> +	new = prepare_creds();
> +	if (!new)
> +		return -ENOMEM;
> +
> +	new->rlimit_ns = ns ? ns : &init_user_ns;
> +
> +	return commit_creds(new);
> +}
> +
>  #ifdef CONFIG_DEBUG_CREDENTIALS
>  
>  bool creds_are_invalid(const struct cred *cred)
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 5a0d7dd1ad64..998436d32373 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
>  	/* don't need to get the RCU readlock here - the process is dead and
>  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
>  	rcu_read_lock();
> -	dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
> +	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
>  	rcu_read_unlock();
>  
>  	cgroup_release(p);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d2b28634dc8f..43f3c54fe4c6 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
>  		current->flags &= ~PF_NPROC_UNS_EXCEEDED;
>  		goto bad_fork_free;
>  	}
> -	processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
> +	processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
>  			UCOUNT_RLIMIT_NPROC);
>  	if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
>  		if (p->real_cred->user != INIT_USER &&
> @@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
>  #endif
>  	delayacct_tsk_free(p);
>  bad_fork_cleanup_count:
> -	dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
> +	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
>  	exit_creds(p);
>  bad_fork_free:
>  	p->state = TASK_DEAD;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index db780ec32d86..917cbd7fc674 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -467,7 +467,7 @@ static int set_user(struct cred *new)
>  	if (!new_user)
>  		return -EAGAIN;
>  
> -	processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> +	processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
>  
>  	/*
>  	 * We don't fail in case of NPROC limit excess here because too many
> @@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  
>  		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
>  		break;
> +	case PR_SET_RLIMIT_USER_NAMESPACE:
> +		if (!capable(CAP_SYS_RESOURCE))
> +			return -EPERM;
Can you have CAP_SYS_RESOURCE in the init user ns while you're in a non-init 
user ns? Shouldn't that only matter if you want to set yourself to 
PR_RLIMIT_BIND_GLOBAL_USERNS anyways and you just want to make sure they have 
CAP_SYS_RESOURCE in the current namespace for PR_RLIMIT_BIND_CURRENT_USERNS?


> +
> +		switch (arg2) {
> +		case PR_RLIMIT_BIND_GLOBAL_USERNS:
> +			error = set_rlimit_ns(&init_user_ns);
> +			break;
> +		case PR_RLIMIT_BIND_CURRENT_USERNS:
> +			error = set_rlimit_ns(current_user_ns());
> +			break;
To some degree, this isn't so much "per user namespace rlimits", as much as it 
is hierarchical rlimits. I'm not going to nitpick about names, but it might be 
worth exploring. Especially because the way the patch is written, it would be 
easy to introduce a third user namespace for rlimits with different mappings.

The downside I see with a sysctl is that changing it midway through the user 
namespaces lifetime could be confusing, and difficult to get right.

> +		default:
> +			error = -EINVAL;
> +		}
> +		break;
> +	case PR_GET_RLIMIT_USER_NAMESPACE:
> +		error = current_rlimit_ns() == &init_user_ns
> +			? PR_RLIMIT_BIND_GLOBAL_USERNS
> +			: PR_RLIMIT_BIND_CURRENT_USERNS;
> +		break;
It would be nice to have this be a sysctl, so everyone who does
setns() would get the behaviour, and if a process wants to 

>  	default:
>  		error = -EINVAL;
>  		break;
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 87804e0371fe..346df35ceba9 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
>  #endif
>  	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
>  	cred->user_ns = user_ns;
> +
> +	cred->rlimit_ns = &init_user_ns;
>  }
>  
>  /*
> @@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
>  	for (i = 0; i < UCOUNT_COUNTS; i++) {
>  		ns->ucount_max[i] = INT_MAX;
>  	}
> +	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
>  	ns->ucounts = ucounts;
>  
>  	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
> -- 
> 2.25.4
>
Alexey Gladkov Nov. 4, 2020, 4:21 p.m. UTC | #4
On Wed, Nov 04, 2020 at 10:03:05AM +0000, Sargun Dhillon wrote:
> On Mon, Nov 02, 2020 at 05:50:33PM +0100, Alexey Gladkov wrote:
> > Add a new prctl to change the user namespace in which the process
> > counter is located. A pointer to the user namespace is in cred struct
> > to be inherited by all child processes.
> > 
> > Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> > ---
> >  fs/exec.c                  |  2 +-
> >  fs/io-wq.c                 | 13 ++++++++-----
> >  fs/io-wq.h                 |  1 +
> >  fs/io_uring.c              |  1 +
> >  include/linux/cred.h       |  8 ++++++++
> >  include/uapi/linux/prctl.h |  5 +++++
> >  kernel/cred.c              | 35 +++++++++++++++++++++++++++++------
> >  kernel/exit.c              |  2 +-
> >  kernel/fork.c              |  4 ++--
> >  kernel/sys.c               | 22 +++++++++++++++++++++-
> >  kernel/user_namespace.c    |  3 +++
> >  11 files changed, 80 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/exec.c b/fs/exec.c
> > index c45dfc716394..574b1381276c 100644
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
> >  		goto out_ret;
> >  	}
> >  
> > -	processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
> > +	processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
> >  
> >  	/*
> >  	 * We move the actual failure in case of RLIMIT_NPROC excess from
> > diff --git a/fs/io-wq.c b/fs/io-wq.c
> > index c3b0843abc9b..19e43ec115cb 100644
> > --- a/fs/io-wq.c
> > +++ b/fs/io-wq.c
> > @@ -116,6 +116,7 @@ struct io_wq {
> >  
> >  	struct task_struct *manager;
> >  	struct user_struct *user;
> It seems like user would be unused here, and you could use creds->user?

Yep. I think so.

> > +	const struct cred *creds;
> >  	refcount_t refs;
> >  	struct completion done;
> >  
> > @@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
> >  	if (worker->flags & IO_WORKER_F_RUNNING)
> >  		atomic_dec(&acct->nr_running);
> >  	if (!(worker->flags & IO_WORKER_F_BOUND))
> > -		dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > +		dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> >  	worker->flags = 0;
> >  	preempt_enable();
> >  
> > @@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
> >  			worker->flags |= IO_WORKER_F_BOUND;
> >  			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
> >  			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
> > -			dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > +			dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> >  		} else {
> > -			if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> > +			if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
> >  				return;
> >  			worker->flags &= ~IO_WORKER_F_BOUND;
> >  			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
> > @@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
> >  	}
> >  
> >  	if (index == IO_WQ_ACCT_UNBOUND &&
> > -	    !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> > +	    !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
> >  		kfree(worker);
> >  		return false;
> >  	}
> > @@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
> >  	if (free_worker)
> >  		return true;
> >  
> > -	processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
> > +	processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
> > +			UCOUNT_RLIMIT_NPROC);
> >  
> >  	if (processes >= acct->max_workers &&
> >  	    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
> > @@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
> >  
> >  	/* caller must already hold a reference to this */
> >  	wq->user = data->user;
> > +	wq->creds = data->creds;
> >  
> >  	for_each_node(node) {
> >  		struct io_wqe *wqe;
> > diff --git a/fs/io-wq.h b/fs/io-wq.h
> > index 071f1a997800..6acc3a04c38f 100644
> > --- a/fs/io-wq.h
> > +++ b/fs/io-wq.h
> > @@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);
> >  
> >  struct io_wq_data {
> >  	struct user_struct *user;
> > +	const struct cred *creds;
> >  
> >  	io_wq_work_fn *do_work;
> >  	free_work_fn *free_work;
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 493e5047e67c..e419923968b3 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
> >  	int ret = 0;
> >  
> >  	data.user = ctx->user;
> > +	data.creds = ctx->creds;
> >  	data.free_work = io_free_work;
> >  	data.do_work = io_wq_submit_work;
> >  
> > diff --git a/include/linux/cred.h b/include/linux/cred.h
> > index 18639c069263..43aee68d117f 100644
> > --- a/include/linux/cred.h
> > +++ b/include/linux/cred.h
> > @@ -144,6 +144,7 @@ struct cred {
> >  #endif
> >  	struct user_struct *user;	/* real user ID subscription */
> >  	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
> > +	struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
> >  	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
> >  	/* RCU deletion */
> >  	union {
> > @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
> >  extern int set_create_files_as(struct cred *, struct inode *);
> >  extern int cred_fscmp(const struct cred *, const struct cred *);
> >  extern void __init cred_init(void);
> > +extern int set_rlimit_ns(struct user_namespace *ns);
> >  
> >  /*
> >   * check for validity of credentials
> > @@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
> >  
> >  #define task_uid(task)		(task_cred_xxx((task), uid))
> >  #define task_euid(task)		(task_cred_xxx((task), euid))
> > +#define task_rlimit_ns(task)	(task_cred_xxx((task), rlimit_ns))
> >  
> >  #define current_cred_xxx(xxx)			\
> >  ({						\
> > @@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
> >  extern struct user_namespace init_user_ns;
> >  #ifdef CONFIG_USER_NS
> >  #define current_user_ns()	(current_cred_xxx(user_ns))
> > +#define current_rlimit_ns()	(current_cred_xxx(rlimit_ns))
> >  #else
> >  static inline struct user_namespace *current_user_ns(void)
> >  {
> >  	return &init_user_ns;
> >  }
> > +static inline struct user_namespace *current_rlimit_ns(void)
> > +{
> > +	return &init_user_ns;
> > +}
> >  #endif
> >  
> >  
> > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> > index 07b4f8131e36..4f853f903415 100644
> > --- a/include/uapi/linux/prctl.h
> > +++ b/include/uapi/linux/prctl.h
> > @@ -238,4 +238,9 @@ struct prctl_mm_map {
> >  #define PR_SET_IO_FLUSHER		57
> >  #define PR_GET_IO_FLUSHER		58
> >  
> > +#define PR_SET_RLIMIT_USER_NAMESPACE	59
> > +#define PR_GET_RLIMIT_USER_NAMESPACE	60
> > +# define PR_RLIMIT_BIND_GLOBAL_USERNS	(1UL << 0)
> > +# define PR_RLIMIT_BIND_CURRENT_USERNS	(1UL << 1)
> > +
> >  #endif /* _LINUX_PRCTL_H */
> > diff --git a/kernel/cred.c b/kernel/cred.c
> > index 748704db1f6b..7b90e1ef9c9a 100644
> > --- a/kernel/cred.c
> > +++ b/kernel/cred.c
> > @@ -59,6 +59,7 @@ struct cred init_cred = {
> >  	.cap_bset		= CAP_FULL_SET,
> >  	.user			= INIT_USER,
> >  	.user_ns		= &init_user_ns,
> > +	.rlimit_ns		= &init_user_ns,
> >  	.group_info		= &init_groups,
> >  };
> >  
> > @@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
> >  		put_group_info(cred->group_info);
> >  	free_uid(cred->user);
> >  	put_user_ns(cred->user_ns);
> > +	put_user_ns(cred->rlimit_ns);
> >  	kmem_cache_free(cred_jar, cred);
> >  }
> >  
> > @@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
> >  	get_group_info(new->group_info);
> >  	get_uid(new->user);
> >  	get_user_ns(new->user_ns);
> > +	get_user_ns(new->rlimit_ns);
> >  
> >  #ifdef CONFIG_KEYS
> >  	key_get(new->session_keyring);
> > @@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> >  #endif
> >  		clone_flags & CLONE_THREAD
> >  	    ) {
> > -		if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
> > +		if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
> >  			return -EACCES;
> >  		p->real_cred = get_cred(p->cred);
> >  		get_cred(p->cred);
> > @@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> >  	}
> >  #endif
> >  
> > -	if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > +	if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> >  		return -EACCES;
> >  	p->cred = p->real_cred = get_cred(new);
> >  	alter_cred_subscribers(new, 2);
> > @@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
> >  	 * perhaps this limit is exceeded in the parent user namespace.
> >  	 */
> >  	alter_cred_subscribers(new, 2);
> > -	if (new->user != old->user &&
> > -	    !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> > +	if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
> > +	    !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
> >  		task->flags |= PF_NPROC_UNS_EXCEEDED;
> >  	rcu_assign_pointer(task->real_cred, new);
> >  	rcu_assign_pointer(task->cred, new);
> > -	if (new->user != old->user)
> > -		dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> > +	if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
> > +		dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
> >  	alter_cred_subscribers(old, -2);
> >  
> >  	/* send notifications */
> > @@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
> >  }
> >  EXPORT_SYMBOL(set_create_files_as);
> >  
> > +/*
> > + * Change the rlimit user namespace of the current task, replacing the existing
> > + * one. If the given namespace is NULL, then initial user namespace will be
> > + * used.
> > + *
> > + * Return: 0 on success; -errno on failure.
> > + */
> > +int set_rlimit_ns(struct user_namespace *ns)
> > +{
> > +	struct cred *new;
> > +
> > +	new = prepare_creds();
> > +	if (!new)
> > +		return -ENOMEM;
> > +
> > +	new->rlimit_ns = ns ? ns : &init_user_ns;
> > +
> > +	return commit_creds(new);
> > +}
> > +
> >  #ifdef CONFIG_DEBUG_CREDENTIALS
> >  
> >  bool creds_are_invalid(const struct cred *cred)
> > diff --git a/kernel/exit.c b/kernel/exit.c
> > index 5a0d7dd1ad64..998436d32373 100644
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
> >  	/* don't need to get the RCU readlock here - the process is dead and
> >  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
> >  	rcu_read_lock();
> > -	dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
> > +	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> >  	rcu_read_unlock();
> >  
> >  	cgroup_release(p);
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index d2b28634dc8f..43f3c54fe4c6 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
> >  		current->flags &= ~PF_NPROC_UNS_EXCEEDED;
> >  		goto bad_fork_free;
> >  	}
> > -	processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
> > +	processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
> >  			UCOUNT_RLIMIT_NPROC);
> >  	if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
> >  		if (p->real_cred->user != INIT_USER &&
> > @@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
> >  #endif
> >  	delayacct_tsk_free(p);
> >  bad_fork_cleanup_count:
> > -	dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
> > +	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
> >  	exit_creds(p);
> >  bad_fork_free:
> >  	p->state = TASK_DEAD;
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index db780ec32d86..917cbd7fc674 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -467,7 +467,7 @@ static int set_user(struct cred *new)
> >  	if (!new_user)
> >  		return -EAGAIN;
> >  
> > -	processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> > +	processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
> >  
> >  	/*
> >  	 * We don't fail in case of NPROC limit excess here because too many
> > @@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >  
> >  		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> >  		break;
> > +	case PR_SET_RLIMIT_USER_NAMESPACE:
> > +		if (!capable(CAP_SYS_RESOURCE))
> > +			return -EPERM;
> Can you have CAP_SYS_RESOURCE in the init user ns while you're in a non-init 
> user ns? Shouldn't that only matter if you want to set yourself to 
> PR_RLIMIT_BIND_GLOBAL_USERNS anyways and you just want to make sure they have 
> CAP_SYS_RESOURCE in the current namespace for PR_RLIMIT_BIND_CURRENT_USERNS?

I think I was wrong to add the CAP_SYS_RESOURCE check because the user
still won't be able to exceed his limit. So since setrlimit does not
require CAP_SYS_RESOURCE, then it is not needed here either.

> > +
> > +		switch (arg2) {
> > +		case PR_RLIMIT_BIND_GLOBAL_USERNS:
> > +			error = set_rlimit_ns(&init_user_ns);
> > +			break;
> > +		case PR_RLIMIT_BIND_CURRENT_USERNS:
> > +			error = set_rlimit_ns(current_user_ns());
> > +			break;
> To some degree, this isn't so much "per user namespace rlimits", as much as it 
> is hierarchical rlimits. I'm not going to nitpick about names, but it might be 
> worth exploring. Especially because the way the patch is written, it would be 
> easy to introduce a third user namespace for rlimits with different mappings.

I agree that the naming is a bit confusing but I couldn't think of a
better one. This is probably because the discussion started with the topic
"per userns rlimits".

This change is more about the rlimit counter but not the rlimit itself.

> The downside I see with a sysctl is that changing it midway through the user 
> namespaces lifetime could be confusing, and difficult to get right.

sysctl requires procfs. I did prctl to avoid it.

> > +		default:
> > +			error = -EINVAL;
> > +		}
> > +		break;
> > +	case PR_GET_RLIMIT_USER_NAMESPACE:
> > +		error = current_rlimit_ns() == &init_user_ns
> > +			? PR_RLIMIT_BIND_GLOBAL_USERNS
> > +			: PR_RLIMIT_BIND_CURRENT_USERNS;
> > +		break;
> It would be nice to have this be a sysctl, so everyone who does
> setns() would get the behaviour, and if a process wants to 

What's so tricky about using prctl?

> >  	default:
> >  		error = -EINVAL;
> >  		break;
> > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> > index 87804e0371fe..346df35ceba9 100644
> > --- a/kernel/user_namespace.c
> > +++ b/kernel/user_namespace.c
> > @@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
> >  #endif
> >  	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
> >  	cred->user_ns = user_ns;
> > +
> > +	cred->rlimit_ns = &init_user_ns;
> >  }
> >  
> >  /*
> > @@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
> >  	for (i = 0; i < UCOUNT_COUNTS; i++) {
> >  		ns->ucount_max[i] = INT_MAX;
> >  	}
> > +	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
> >  	ns->ucounts = ucounts;
> >  
> >  	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
> > -- 
> > 2.25.4
> > 
>
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index c45dfc716394..574b1381276c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1837,7 +1837,7 @@  static int __do_execve_file(int fd, struct filename *filename,
 		goto out_ret;
 	}
 
-	processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
+	processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);
 
 	/*
 	 * We move the actual failure in case of RLIMIT_NPROC excess from
diff --git a/fs/io-wq.c b/fs/io-wq.c
index c3b0843abc9b..19e43ec115cb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -116,6 +116,7 @@  struct io_wq {
 
 	struct task_struct *manager;
 	struct user_struct *user;
+	const struct cred *creds;
 	refcount_t refs;
 	struct completion done;
 
@@ -217,7 +218,7 @@  static void io_worker_exit(struct io_worker *worker)
 	if (worker->flags & IO_WORKER_F_RUNNING)
 		atomic_dec(&acct->nr_running);
 	if (!(worker->flags & IO_WORKER_F_BOUND))
-		dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+		dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
 	worker->flags = 0;
 	preempt_enable();
 
@@ -350,9 +351,9 @@  static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 			worker->flags |= IO_WORKER_F_BOUND;
 			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
 			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
-			dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+			dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
 		} else {
-			if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
+			if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
 				return;
 			worker->flags &= ~IO_WORKER_F_BOUND;
 			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
@@ -662,7 +663,7 @@  static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	}
 
 	if (index == IO_WQ_ACCT_UNBOUND &&
-	    !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
+	    !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
 		kfree(worker);
 		return false;
 	}
@@ -772,7 +773,8 @@  static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 	if (free_worker)
 		return true;
 
-	processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+	processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
+			UCOUNT_RLIMIT_NPROC);
 
 	if (processes >= acct->max_workers &&
 	    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
@@ -1049,6 +1051,7 @@  struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	/* caller must already hold a reference to this */
 	wq->user = data->user;
+	wq->creds = data->creds;
 
 	for_each_node(node) {
 		struct io_wqe *wqe;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 071f1a997800..6acc3a04c38f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -105,6 +105,7 @@  typedef void (io_wq_work_fn)(struct io_wq_work **);
 
 struct io_wq_data {
 	struct user_struct *user;
+	const struct cred *creds;
 
 	io_wq_work_fn *do_work;
 	free_work_fn *free_work;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 493e5047e67c..e419923968b3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6933,6 +6933,7 @@  static int io_init_wq_offload(struct io_ring_ctx *ctx,
 	int ret = 0;
 
 	data.user = ctx->user;
+	data.creds = ctx->creds;
 	data.free_work = io_free_work;
 	data.do_work = io_wq_submit_work;
 
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..43aee68d117f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -144,6 +144,7 @@  struct cred {
 #endif
 	struct user_struct *user;	/* real user ID subscription */
 	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+	struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
 	/* RCU deletion */
 	union {
@@ -170,6 +171,7 @@  extern int set_security_override_from_ctx(struct cred *, const char *);
 extern int set_create_files_as(struct cred *, struct inode *);
 extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
+extern int set_rlimit_ns(struct user_namespace *ns);
 
 /*
  * check for validity of credentials
@@ -370,6 +372,7 @@  static inline void put_cred(const struct cred *_cred)
 
 #define task_uid(task)		(task_cred_xxx((task), uid))
 #define task_euid(task)		(task_cred_xxx((task), euid))
+#define task_rlimit_ns(task)	(task_cred_xxx((task), rlimit_ns))
 
 #define current_cred_xxx(xxx)			\
 ({						\
@@ -390,11 +393,16 @@  static inline void put_cred(const struct cred *_cred)
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
 #define current_user_ns()	(current_cred_xxx(user_ns))
+#define current_rlimit_ns()	(current_cred_xxx(rlimit_ns))
 #else
 static inline struct user_namespace *current_user_ns(void)
 {
 	return &init_user_ns;
 }
+static inline struct user_namespace *current_rlimit_ns(void)
+{
+	return &init_user_ns;
+}
 #endif
 
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 07b4f8131e36..4f853f903415 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -238,4 +238,9 @@  struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER		57
 #define PR_GET_IO_FLUSHER		58
 
+#define PR_SET_RLIMIT_USER_NAMESPACE	59
+#define PR_GET_RLIMIT_USER_NAMESPACE	60
+# define PR_RLIMIT_BIND_GLOBAL_USERNS	(1UL << 0)
+# define PR_RLIMIT_BIND_CURRENT_USERNS	(1UL << 1)
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 748704db1f6b..7b90e1ef9c9a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -59,6 +59,7 @@  struct cred init_cred = {
 	.cap_bset		= CAP_FULL_SET,
 	.user			= INIT_USER,
 	.user_ns		= &init_user_ns,
+	.rlimit_ns		= &init_user_ns,
 	.group_info		= &init_groups,
 };
 
@@ -120,6 +121,7 @@  static void put_cred_rcu(struct rcu_head *rcu)
 		put_group_info(cred->group_info);
 	free_uid(cred->user);
 	put_user_ns(cred->user_ns);
+	put_user_ns(cred->rlimit_ns);
 	kmem_cache_free(cred_jar, cred);
 }
 
@@ -270,6 +272,7 @@  struct cred *prepare_creds(void)
 	get_group_info(new->group_info);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
+	get_user_ns(new->rlimit_ns);
 
 #ifdef CONFIG_KEYS
 	key_get(new->session_keyring);
@@ -345,7 +348,7 @@  int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 		clone_flags & CLONE_THREAD
 	    ) {
-		if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
+		if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
 			return -EACCES;
 		p->real_cred = get_cred(p->cred);
 		get_cred(p->cred);
@@ -385,7 +388,7 @@  int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	}
 #endif
 
-	if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+	if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
 		return -EACCES;
 	p->cred = p->real_cred = get_cred(new);
 	alter_cred_subscribers(new, 2);
@@ -487,13 +490,13 @@  int commit_creds(struct cred *new)
 	 * perhaps this limit is exceeded in the parent user namespace.
 	 */
 	alter_cred_subscribers(new, 2);
-	if (new->user != old->user &&
-	    !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+	if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
+	    !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
 		task->flags |= PF_NPROC_UNS_EXCEEDED;
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
-	if (new->user != old->user)
-		dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
+	if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
+		dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
 	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
@@ -789,6 +792,26 @@  int set_create_files_as(struct cred *new, struct inode *inode)
 }
 EXPORT_SYMBOL(set_create_files_as);
 
+/*
+ * Change the rlimit user namespace of the current task, replacing the existing
+ * one. If the given namespace is NULL, then initial user namespace will be
+ * used.
+ *
+ * Return: 0 on success; -errno on failure.
+ */
+int set_rlimit_ns(struct user_namespace *ns)
+{
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->rlimit_ns = ns ? ns : &init_user_ns;
+
+	return commit_creds(new);
+}
+
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
 bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/exit.c b/kernel/exit.c
index 5a0d7dd1ad64..998436d32373 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -187,7 +187,7 @@  void release_task(struct task_struct *p)
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
-	dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
+	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
 	rcu_read_unlock();
 
 	cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2b28634dc8f..43f3c54fe4c6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1963,7 +1963,7 @@  static __latent_entropy struct task_struct *copy_process(
 		current->flags &= ~PF_NPROC_UNS_EXCEEDED;
 		goto bad_fork_free;
 	}
-	processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
+	processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
 			UCOUNT_RLIMIT_NPROC);
 	if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
 		if (p->real_cred->user != INIT_USER &&
@@ -2366,7 +2366,7 @@  static __latent_entropy struct task_struct *copy_process(
 #endif
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
-	dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
+	dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
 	exit_creds(p);
 bad_fork_free:
 	p->state = TASK_DEAD;
diff --git a/kernel/sys.c b/kernel/sys.c
index db780ec32d86..917cbd7fc674 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -467,7 +467,7 @@  static int set_user(struct cred *new)
 	if (!new_user)
 		return -EAGAIN;
 
-	processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
+	processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
 
 	/*
 	 * We don't fail in case of NPROC limit excess here because too many
@@ -2529,6 +2529,26 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
 		break;
+	case PR_SET_RLIMIT_USER_NAMESPACE:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		switch (arg2) {
+		case PR_RLIMIT_BIND_GLOBAL_USERNS:
+			error = set_rlimit_ns(&init_user_ns);
+			break;
+		case PR_RLIMIT_BIND_CURRENT_USERNS:
+			error = set_rlimit_ns(current_user_ns());
+			break;
+		default:
+			error = -EINVAL;
+		}
+		break;
+	case PR_GET_RLIMIT_USER_NAMESPACE:
+		error = current_rlimit_ns() == &init_user_ns
+			? PR_RLIMIT_BIND_GLOBAL_USERNS
+			: PR_RLIMIT_BIND_CURRENT_USERNS;
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 87804e0371fe..346df35ceba9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -56,6 +56,8 @@  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 #endif
 	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
 	cred->user_ns = user_ns;
+
+	cred->rlimit_ns = &init_user_ns;
 }
 
 /*
@@ -121,6 +123,7 @@  int create_user_ns(struct cred *new)
 	for (i = 0; i < UCOUNT_COUNTS; i++) {
 		ns->ucount_max[i] = INT_MAX;
 	}
+	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
 	ns->ucounts = ucounts;
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */