diff mbox series

[v8,07/11] proc: flush task dcache entries from all procfs instances

Message ID 20200210150519.538333-8-gladkov.alexey@gmail.com (mailing list archive)
State New, archived
Headers show
Series proc: modernize proc to support multiple private instances | expand

Commit Message

Alexey Gladkov Feb. 10, 2020, 3:05 p.m. UTC
This allows flushing the dcache entries of a task on multiple procfs mounts
per pid namespace.

The RCU lock is used because the number of reads at the task exit time
is much larger than the number of procfs mounts.

Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
---
 fs/proc/base.c                | 20 +++++++++++++++-----
 fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
 include/linux/pid_namespace.h |  2 ++
 include/linux/proc_fs.h       |  2 ++
 4 files changed, 45 insertions(+), 6 deletions(-)

Comments

Linus Torvalds Feb. 10, 2020, 5:46 p.m. UTC | #1
On Mon, Feb 10, 2020 at 7:06 AM Alexey Gladkov <gladkov.alexey@gmail.com> wrote:
>
> This allows to flush dcache entries of a task on multiple procfs mounts
> per pid namespace.
>
> The RCU lock is used because the number of reads at the task exit time
> is much larger than the number of procfs mounts.

Ok, this looks better to me than the previous version.

But that may be the "pee-in-the-snow" effect, and I _really_ want
others to take a good look at the whole series.

The right people seem to be cc'd, but this is pretty core, and /proc
has a tendency to cause interesting issues because of how it's
involved in a lot of areas indirectly.

Al, Oleg, Andy, Eric?

             Linus
Al Viro Feb. 10, 2020, 7:23 p.m. UTC | #2
On Mon, Feb 10, 2020 at 09:46:26AM -0800, Linus Torvalds wrote:
> On Mon, Feb 10, 2020 at 7:06 AM Alexey Gladkov <gladkov.alexey@gmail.com> wrote:
> >
> > This allows to flush dcache entries of a task on multiple procfs mounts
> > per pid namespace.
> >
> > The RCU lock is used because the number of reads at the task exit time
> > is much larger than the number of procfs mounts.
> 
> Ok, this looks better to me than the previous version.
> 
> But that may be the "pee-in-the-snow" effect, and I _really_ want
> others to take a good look at the whole series.
> 
> The right people seem to be cc'd, but this is pretty core, and /proc
> has a tendency to cause interesting issues because of how it's
> involved in a lot of areas indirectly.
> 
> Al, Oleg, Andy, Eric?

Will check tonight (ears-deep in sorting out the old branches right now)
Eric W. Biederman Feb. 11, 2020, 1:36 a.m. UTC | #3
Alexey Gladkov <gladkov.alexey@gmail.com> writes:

> This allows to flush dcache entries of a task on multiple procfs mounts
> per pid namespace.
>
> The RCU lock is used because the number of reads at the task exit time
> is much larger than the number of procfs mounts.

A couple of quick comments.

> Cc: Kees Cook <keescook@chromium.org>
> Cc: Andy Lutomirski <luto@kernel.org>
> Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> ---
>  fs/proc/base.c                | 20 +++++++++++++++-----
>  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
>  include/linux/pid_namespace.h |  2 ++
>  include/linux/proc_fs.h       |  2 ++
>  4 files changed, 45 insertions(+), 6 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 4ccb280a3e79..24b7c620ded3 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>  	.permission	= proc_pid_permission,
>  };
>  
> -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
Perhaps just rename things like:
> +static void proc_flush_task_root(struct dentry *root, pid_t pid, pid_t tgid)
>  {

I don't think the mnt_ prefix conveys any information, and it certainly
makes everything longer and more cumbersome.

>  	struct dentry *dentry, *leader, *dir;
>  	char buf[10 + 1];
> @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>  	name.name = buf;
>  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
>  	/* no ->d_hash() rejects on procfs */
> -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
> +	dentry = d_hash_and_lookup(mnt_root, &name);
>  	if (dentry) {
>  		d_invalidate(dentry);
>  		dput(dentry);
> @@ -3153,7 +3153,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>  
>  	name.name = buf;
>  	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
> -	leader = d_hash_and_lookup(mnt->mnt_root, &name);
> +	leader = d_hash_and_lookup(mnt_root, &name);
>  	if (!leader)
>  		goto out;
>  
> @@ -3208,14 +3208,24 @@ void proc_flush_task(struct task_struct *task)
>  	int i;
>  	struct pid *pid, *tgid;
>  	struct upid *upid;
> +	struct dentry *mnt_root;
> +	struct proc_fs_info *fs_info;
>  
>  	pid = task_pid(task);
>  	tgid = task_tgid(task);
>  
>  	for (i = 0; i <= pid->level; i++) {
>  		upid = &pid->numbers[i];
> -		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> -					tgid->numbers[i].nr);
> +
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
> +			mnt_root = fs_info->m_super->s_root;
> +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
> +		}
> +		rcu_read_unlock();
> +
> +		mnt_root = upid->ns->proc_mnt->mnt_root;
> +		proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);

I don't think this following of proc_mnt is needed.  It certainly
shouldn't be.  The loop through all of the super blocks should be
enough.

Once this change goes through, UML can be given its own dedicated
proc_mnt for the initial pid namespace, and proc_mnt can be removed
entirely.

Unless something has changed recently UML is the only other user of
pid_ns->proc_mnt.  That proc_mnt really only exists to make the loop in
proc_flush_task easy to write.

It also probably makes sense to take the rcu_read_lock() over
that entire for loop.


>  	}
>  }
>  
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index 5d5cba4c899b..e2bb015da1a8 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -149,7 +149,22 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
>  	if (ret) {
>  		return ret;
>  	}
> -	return proc_setup_thread_self(s);
> +
> +	ret = proc_setup_thread_self(s);
> +	if (ret) {
> +		return ret;
> +	}
> +
> +	/*
> +	 * back reference to flush dcache entries at process exit time.
> +	 */
> +	ctx->fs_info->m_super = s;
> +
> +	spin_lock(&pid_ns->proc_mounts_lock);
> +	list_add_tail_rcu(&ctx->fs_info->pidns_entry, &pid_ns->proc_mounts);
> +	spin_unlock(&pid_ns->proc_mounts_lock);
> +
> +	return 0;
>  }
>  
>  static int proc_reconfigure(struct fs_context *fc)
> @@ -211,10 +226,17 @@ static void proc_kill_sb(struct super_block *sb)
>  {
>  	struct proc_fs_info *fs_info = proc_sb_info(sb);
>  
> +	spin_lock(&fs_info->pid_ns->proc_mounts_lock);
> +	list_del_rcu(&fs_info->pidns_entry);
> +	spin_unlock(&fs_info->pid_ns->proc_mounts_lock);
> +
> +	synchronize_rcu();
> +

Rather than a heavyweight synchronize_rcu here,
it probably makes sense to instead to change

	kfree(fs_info)
to
	kfree_rcu(fs_info, some_rcu_head_in_fs_info)

Or possibly doing a full call_rcu.

The problem is that synchronize_rcu takes about 10ms when HZ=100.
Which makes synchronize_rcu incredibly expensive on any path where
anything can wait for it.

>  	if (fs_info->proc_self)
>  		dput(fs_info->proc_self);
>  	if (fs_info->proc_thread_self)
>  		dput(fs_info->proc_thread_self);
> +
>  	kill_anon_super(sb);
>  	put_pid_ns(fs_info->pid_ns);
>  	kfree(fs_info);
> @@ -336,6 +358,9 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
>  		ctx->fs_info->pid_ns = ns;
>  	}
>  
> +	spin_lock_init(&ns->proc_mounts_lock);
> +	INIT_LIST_HEAD_RCU(&ns->proc_mounts);
> +
>  	mnt = fc_mount(fc);
>  	put_fs_context(fc);
>  	if (IS_ERR(mnt))
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 66f47f1afe0d..c36af1dfd862 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -26,6 +26,8 @@ struct pid_namespace {
>  	struct pid_namespace *parent;
>  #ifdef CONFIG_PROC_FS
>  	struct vfsmount *proc_mnt; /* Internal proc mounted during each new pidns */
> +	spinlock_t proc_mounts_lock;
> +	struct list_head proc_mounts; /* list of separated procfs mounts */
>  #endif
>  #ifdef CONFIG_BSD_PROCESS_ACCT
>  	struct fs_pin *bacct;
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 5f0b1b7e4271..f307940f8311 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -20,6 +20,8 @@ enum {
>  };
>  
>  struct proc_fs_info {
> +	struct list_head pidns_entry;    /* Node in procfs_mounts of a pidns */
> +	struct super_block *m_super;
>  	struct pid_namespace *pid_ns;
>  	struct dentry *proc_self;        /* For /proc/self */
>  	struct dentry *proc_thread_self; /* For /proc/thread-self */


Eric
Eric W. Biederman Feb. 11, 2020, 4:01 a.m. UTC | #4
ebiederm@xmission.com (Eric W. Biederman) writes:

> Alexey Gladkov <gladkov.alexey@gmail.com> writes:
>
>> This allows to flush dcache entries of a task on multiple procfs mounts
>> per pid namespace.
>>
>> The RCU lock is used because the number of reads at the task exit time
>> is much larger than the number of procfs mounts.
>
> A couple of quick comments.
>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
>> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
>> Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
>> ---
>>  fs/proc/base.c                | 20 +++++++++++++++-----
>>  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
>>  include/linux/pid_namespace.h |  2 ++
>>  include/linux/proc_fs.h       |  2 ++
>>  4 files changed, 45 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 4ccb280a3e79..24b7c620ded3 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>>  	.permission	= proc_pid_permission,
>>  };
>>  
>> -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>> +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
> Perhaps just rename things like:
>> +static void proc_flush_task_root(struct dentry *root, pid_t pid, pid_t tgid)
>>  {
>
> I don't think the mnt_ prefix conveys any information, and it certainly
> makes everything longer and more cumbersome.
>
>>  	struct dentry *dentry, *leader, *dir;
>>  	char buf[10 + 1];
>> @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>>  	name.name = buf;
>>  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
>>  	/* no ->d_hash() rejects on procfs */
>> -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
>> +	dentry = d_hash_and_lookup(mnt_root, &name);
>>  	if (dentry) {
>>  		d_invalidate(dentry);
>>  		dput(dentry);
>> @@ -3153,7 +3153,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>>  
>>  	name.name = buf;
>>  	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
>> -	leader = d_hash_and_lookup(mnt->mnt_root, &name);
>> +	leader = d_hash_and_lookup(mnt_root, &name);
>>  	if (!leader)
>>  		goto out;
>>  
>> @@ -3208,14 +3208,24 @@ void proc_flush_task(struct task_struct *task)
>>  	int i;
>>  	struct pid *pid, *tgid;
>>  	struct upid *upid;
>> +	struct dentry *mnt_root;
>> +	struct proc_fs_info *fs_info;
>>  
>>  	pid = task_pid(task);
>>  	tgid = task_tgid(task);
>>  
>>  	for (i = 0; i <= pid->level; i++) {
>>  		upid = &pid->numbers[i];
>> -		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>> -					tgid->numbers[i].nr);
>> +
>> +		rcu_read_lock();
>> +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
>> +			mnt_root = fs_info->m_super->s_root;
>> +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
>> +		}
>> +		rcu_read_unlock();
>> +
>> +		mnt_root = upid->ns->proc_mnt->mnt_root;
>> +		proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
>
> I don't think this following of proc_mnt is needed.  It certainly
> shouldn't be.  The loop through all of the super blocks should be
> enough.
>
> Once this change goes through.  UML can be given it's own dedicated
> proc_mnt for the initial pid namespace, and proc_mnt can be removed
> entirely.
>
> Unless something has changed recently UML is the only other user of
> pid_ns->proc_mnt.  That proc_mnt really only exists to make the loop in
> proc_flush_task easy to write.
>
> It also probably makes sense to take the rcu_read_lock() over
> that entire for loop.
>
>
>>  	}
>>  }
>>  
>> diff --git a/fs/proc/root.c b/fs/proc/root.c
>> index 5d5cba4c899b..e2bb015da1a8 100644
>> --- a/fs/proc/root.c
>> +++ b/fs/proc/root.c
>> @@ -149,7 +149,22 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
>>  	if (ret) {
>>  		return ret;
>>  	}
>> -	return proc_setup_thread_self(s);
>> +
>> +	ret = proc_setup_thread_self(s);
>> +	if (ret) {
>> +		return ret;
>> +	}
>> +
>> +	/*
>> +	 * back reference to flush dcache entries at process exit time.
>> +	 */
>> +	ctx->fs_info->m_super = s;
>> +
>> +	spin_lock(&pid_ns->proc_mounts_lock);
>> +	list_add_tail_rcu(&ctx->fs_info->pidns_entry, &pid_ns->proc_mounts);
>> +	spin_unlock(&pid_ns->proc_mounts_lock);
>> +
>> +	return 0;
>>  }
>>  
>>  static int proc_reconfigure(struct fs_context *fc)
>> @@ -211,10 +226,17 @@ static void proc_kill_sb(struct super_block *sb)
>>  {
>>  	struct proc_fs_info *fs_info = proc_sb_info(sb);
>>  
>> +	spin_lock(&fs_info->pid_ns->proc_mounts_lock);
>> +	list_del_rcu(&fs_info->pidns_entry);
>> +	spin_unlock(&fs_info->pid_ns->proc_mounts_lock);
>> +
>> +	synchronize_rcu();
>> +
>
> Rather than a heavyweight synchronize_rcu here,
> it probably makes sense to instead to change
>
> 	kfree(fs_info)
> to
> 	kfree_rcu(fs_info, some_rcu_head_in_fs_info)
>
> Or possibly doing a full call_rcu.
>
> The problem is that synchronize_rcu takes about 10ms when HZ=100.
> Which makes synchronize_rcu incredibly expensive on any path where
> anything can wait for it.

I take it back.

The crucial thing to insert a rcu delay before is
shrink_dcache_for_umount.

With the call chain:
kill_anon_super
    generic_shutdown_super
        shrink_dcache_for_umount.

So we do need synchronize_rcu or to put everything below this point into
a call_rcu function.

The practical concern is that mntput calls task_work_add.
The task_work function is cleanup_mnt.
When called cleanup_mnt calls deactivate_locked_super
which in turn calls proc_kill_sb.

Which is a long way of saying that the code will delay the
exit of the process that calls mntput by 10ms (or for however
long synchronize_rcu takes).

We probably don't care.  But that is long enough to be user
visible, and potentially cause problems.

But all it requires is putting the code below in its own
function and triggering it with call_rcu if the synchronize_rcu
causes a regression by being so slow.

>>  	if (fs_info->proc_self)
>>  		dput(fs_info->proc_self);
>>  	if (fs_info->proc_thread_self)
>>  		dput(fs_info->proc_thread_self);
>> +
>>  	kill_anon_super(sb);
>>  	put_pid_ns(fs_info->pid_ns);
>>  	kfree(fs_info);
>> @@ -336,6 +358,9 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
>>  		ctx->fs_info->pid_ns = ns;
>>  	}
>>  
>> +	spin_lock_init(&ns->proc_mounts_lock);
>> +	INIT_LIST_HEAD_RCU(&ns->proc_mounts);
>> +
>>  	mnt = fc_mount(fc);
>>  	put_fs_context(fc);
>>  	if (IS_ERR(mnt))
>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>> index 66f47f1afe0d..c36af1dfd862 100644
>> --- a/include/linux/pid_namespace.h
>> +++ b/include/linux/pid_namespace.h
>> @@ -26,6 +26,8 @@ struct pid_namespace {
>>  	struct pid_namespace *parent;
>>  #ifdef CONFIG_PROC_FS
>>  	struct vfsmount *proc_mnt; /* Internal proc mounted during each new pidns */
>> +	spinlock_t proc_mounts_lock;
>> +	struct list_head proc_mounts; /* list of separated procfs mounts */
>>  #endif
>>  #ifdef CONFIG_BSD_PROCESS_ACCT
>>  	struct fs_pin *bacct;
>> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
>> index 5f0b1b7e4271..f307940f8311 100644
>> --- a/include/linux/proc_fs.h
>> +++ b/include/linux/proc_fs.h
>> @@ -20,6 +20,8 @@ enum {
>>  };
>>  
>>  struct proc_fs_info {
>> +	struct list_head pidns_entry;    /* Node in procfs_mounts of a pidns */
>> +	struct super_block *m_super;
>>  	struct pid_namespace *pid_ns;
>>  	struct dentry *proc_self;        /* For /proc/self */
>>  	struct dentry *proc_thread_self; /* For /proc/thread-self */
>
>
> Eric
Al Viro Feb. 11, 2020, 10:45 p.m. UTC | #5
On Mon, Feb 10, 2020 at 04:05:15PM +0100, Alexey Gladkov wrote:
> This allows to flush dcache entries of a task on multiple procfs mounts
> per pid namespace.
> 
> The RCU lock is used because the number of reads at the task exit time
> is much larger than the number of procfs mounts.
> 
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Andy Lutomirski <luto@kernel.org>
> Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> ---
>  fs/proc/base.c                | 20 +++++++++++++++-----
>  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
>  include/linux/pid_namespace.h |  2 ++
>  include/linux/proc_fs.h       |  2 ++
>  4 files changed, 45 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 4ccb280a3e79..24b7c620ded3 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>  	.permission	= proc_pid_permission,
>  };
>  
> -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
>  {
>  	struct dentry *dentry, *leader, *dir;
>  	char buf[10 + 1];
> @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>  	name.name = buf;
>  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
>  	/* no ->d_hash() rejects on procfs */
> -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
> +	dentry = d_hash_and_lookup(mnt_root, &name);
>  	if (dentry) {
>  		d_invalidate(dentry);
... which can block
>  		dput(dentry);
... and so can this

> +		rcu_read_lock();
> +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
> +			mnt_root = fs_info->m_super->s_root;
> +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);

... making that more than slightly unsafe.
Alexey Gladkov Feb. 12, 2020, 2:26 p.m. UTC | #6
On Tue, Feb 11, 2020 at 10:45:53PM +0000, Al Viro wrote:
> On Mon, Feb 10, 2020 at 04:05:15PM +0100, Alexey Gladkov wrote:
> > This allows to flush dcache entries of a task on multiple procfs mounts
> > per pid namespace.
> > 
> > The RCU lock is used because the number of reads at the task exit time
> > is much larger than the number of procfs mounts.
> > 
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Andy Lutomirski <luto@kernel.org>
> > Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> > Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> > ---
> >  fs/proc/base.c                | 20 +++++++++++++++-----
> >  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
> >  include/linux/pid_namespace.h |  2 ++
> >  include/linux/proc_fs.h       |  2 ++
> >  4 files changed, 45 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 4ccb280a3e79..24b7c620ded3 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> >  	.permission	= proc_pid_permission,
> >  };
> >  
> > -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> > +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
> >  {
> >  	struct dentry *dentry, *leader, *dir;
> >  	char buf[10 + 1];
> > @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> >  	name.name = buf;
> >  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
> >  	/* no ->d_hash() rejects on procfs */
> > -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
> > +	dentry = d_hash_and_lookup(mnt_root, &name);
> >  	if (dentry) {
> >  		d_invalidate(dentry);
> ... which can block
> >  		dput(dentry);
> ... and so can this
> 
> > +		rcu_read_lock();
> > +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
> > +			mnt_root = fs_info->m_super->s_root;
> > +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
> 
> ... making that more than slightly unsafe.

I see. So, I can't use rcu locks here as well as spinlocks.
Thanks!
Alexey Gladkov Feb. 12, 2020, 2:49 p.m. UTC | #7
On Mon, Feb 10, 2020 at 07:36:08PM -0600, Eric W. Biederman wrote:
> Alexey Gladkov <gladkov.alexey@gmail.com> writes:
> 
> > This allows to flush dcache entries of a task on multiple procfs mounts
> > per pid namespace.
> >
> > The RCU lock is used because the number of reads at the task exit time
> > is much larger than the number of procfs mounts.
> 
> A couple of quick comments.
> 
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Andy Lutomirski <luto@kernel.org>
> > Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> > Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> > ---
> >  fs/proc/base.c                | 20 +++++++++++++++-----
> >  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
> >  include/linux/pid_namespace.h |  2 ++
> >  include/linux/proc_fs.h       |  2 ++
> >  4 files changed, 45 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 4ccb280a3e79..24b7c620ded3 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> >  	.permission	= proc_pid_permission,
> >  };
> >  
> > -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> > +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
> Perhaps just rename things like:
> > +static void proc_flush_task_root(struct dentry *root, pid_t pid, pid_t tgid)
> >  {
> 
> I don't think the mnt_ prefix conveys any information, and it certainly
> makes everything longer and more cumbersome.
> 
> >  	struct dentry *dentry, *leader, *dir;
> >  	char buf[10 + 1];
> > @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> >  	name.name = buf;
> >  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
> >  	/* no ->d_hash() rejects on procfs */
> > -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
> > +	dentry = d_hash_and_lookup(mnt_root, &name);
> >  	if (dentry) {
> >  		d_invalidate(dentry);
> >  		dput(dentry);
> > @@ -3153,7 +3153,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> >  
> >  	name.name = buf;
> >  	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
> > -	leader = d_hash_and_lookup(mnt->mnt_root, &name);
> > +	leader = d_hash_and_lookup(mnt_root, &name);
> >  	if (!leader)
> >  		goto out;
> >  
> > @@ -3208,14 +3208,24 @@ void proc_flush_task(struct task_struct *task)
> >  	int i;
> >  	struct pid *pid, *tgid;
> >  	struct upid *upid;
> > +	struct dentry *mnt_root;
> > +	struct proc_fs_info *fs_info;
> >  
> >  	pid = task_pid(task);
> >  	tgid = task_tgid(task);
> >  
> >  	for (i = 0; i <= pid->level; i++) {
> >  		upid = &pid->numbers[i];
> > -		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> > -					tgid->numbers[i].nr);
> > +
> > +		rcu_read_lock();
> > +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
> > +			mnt_root = fs_info->m_super->s_root;
> > +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
> > +		}
> > +		rcu_read_unlock();
> > +
> > +		mnt_root = upid->ns->proc_mnt->mnt_root;
> > +		proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
> 
> I don't think this following of proc_mnt is needed.  It certainly
> shouldn't be.  The loop through all of the super blocks should be
> enough.

Yes, thanks!

> Once this change goes through.  UML can be given it's own dedicated
> proc_mnt for the initial pid namespace, and proc_mnt can be removed
> entirely.

After you deleted the old sysctl syscall we could probably do it.

> Unless something has changed recently UML is the only other user of
> pid_ns->proc_mnt.  That proc_mnt really only exists to make the loop in
> proc_flush_task easy to write.

Now I think, is there any way to get rid of proc_mounts or even
proc_flush_task somehow.

> It also probably makes sense to take the rcu_read_lock() over
> that entire for loop.

Al Viro pointed out to me that I cannot use rcu locks here :(
Eric W. Biederman Feb. 12, 2020, 2:59 p.m. UTC | #8
Alexey Gladkov <gladkov.alexey@gmail.com> writes:

> On Mon, Feb 10, 2020 at 07:36:08PM -0600, Eric W. Biederman wrote:
>> Alexey Gladkov <gladkov.alexey@gmail.com> writes:
>> 
>> > This allows to flush dcache entries of a task on multiple procfs mounts
>> > per pid namespace.
>> >
>> > The RCU lock is used because the number of reads at the task exit time
>> > is much larger than the number of procfs mounts.
>> 
>> A couple of quick comments.
>> 
>> > Cc: Kees Cook <keescook@chromium.org>
>> > Cc: Andy Lutomirski <luto@kernel.org>
>> > Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
>> > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
>> > Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
>> > ---
>> >  fs/proc/base.c                | 20 +++++++++++++++-----
>> >  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
>> >  include/linux/pid_namespace.h |  2 ++
>> >  include/linux/proc_fs.h       |  2 ++
>> >  4 files changed, 45 insertions(+), 6 deletions(-)
>> >
>> > diff --git a/fs/proc/base.c b/fs/proc/base.c
>> > index 4ccb280a3e79..24b7c620ded3 100644
>> > --- a/fs/proc/base.c
>> > +++ b/fs/proc/base.c
>> > @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>> >  	.permission	= proc_pid_permission,
>> >  };
>> >  
>> > -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>> > +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
>> Perhaps just rename things like:
>> > +static void proc_flush_task_root(struct dentry *root, pid_t pid, pid_t tgid)
>> >  {
>> 
>> I don't think the mnt_ prefix conveys any information, and it certainly
>> makes everything longer and more cumbersome.
>> 
>> >  	struct dentry *dentry, *leader, *dir;
>> >  	char buf[10 + 1];
>> > @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>> >  	name.name = buf;
>> >  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
>> >  	/* no ->d_hash() rejects on procfs */
>> > -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
>> > +	dentry = d_hash_and_lookup(mnt_root, &name);
>> >  	if (dentry) {
>> >  		d_invalidate(dentry);
>> >  		dput(dentry);
>> > @@ -3153,7 +3153,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>> >  
>> >  	name.name = buf;
>> >  	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
>> > -	leader = d_hash_and_lookup(mnt->mnt_root, &name);
>> > +	leader = d_hash_and_lookup(mnt_root, &name);
>> >  	if (!leader)
>> >  		goto out;
>> >  
>> > @@ -3208,14 +3208,24 @@ void proc_flush_task(struct task_struct *task)
>> >  	int i;
>> >  	struct pid *pid, *tgid;
>> >  	struct upid *upid;
>> > +	struct dentry *mnt_root;
>> > +	struct proc_fs_info *fs_info;
>> >  
>> >  	pid = task_pid(task);
>> >  	tgid = task_tgid(task);
>> >  
>> >  	for (i = 0; i <= pid->level; i++) {
>> >  		upid = &pid->numbers[i];
>> > -		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>> > -					tgid->numbers[i].nr);
>> > +
>> > +		rcu_read_lock();
>> > +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
>> > +			mnt_root = fs_info->m_super->s_root;
>> > +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
>> > +		}
>> > +		rcu_read_unlock();
>> > +
>> > +		mnt_root = upid->ns->proc_mnt->mnt_root;
>> > +		proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
>> 
>> I don't think this following of proc_mnt is needed.  It certainly
>> shouldn't be.  The loop through all of the super blocks should be
>> enough.
>
> Yes, thanks!
>
>> Once this change goes through.  UML can be given it's own dedicated
>> proc_mnt for the initial pid namespace, and proc_mnt can be removed
>> entirely.
>
> After you deleted the old sysctl syscall we could probably do it.
>
>> Unless something has changed recently UML is the only other user of
>> pid_ns->proc_mnt.  That proc_mnt really only exists to make the loop in
>> proc_flush_task easy to write.
>
> Now I think, is there any way to get rid of proc_mounts or even
> proc_flush_task somehow.
>
>> It also probably makes sense to take the rcu_read_lock() over
>> that entire for loop.
>
> Al Viro pointed out to me that I cannot use rcu locks here :(

Fundamentally proc_flush_task is an optimization.  Just getting rid of
dentries earlier.  At least at one point it was an important
optimization because the old process dentries would just sit around
doing nothing for anyone.

I wonder if instead of invalidating specific dentries we could instead
wake up a shrinker and point it at one or more instances of proc.

The practical challenge I see is something might need to access the
dentries to see that they are invalid.

We definitely could try without this optimization and see what happens.

Eric
Alexey Gladkov Feb. 12, 2020, 5:08 p.m. UTC | #9
On Wed, Feb 12, 2020 at 08:59:29AM -0600, Eric W. Biederman wrote:
> Alexey Gladkov <gladkov.alexey@gmail.com> writes:
> 
> > On Mon, Feb 10, 2020 at 07:36:08PM -0600, Eric W. Biederman wrote:
> >> Alexey Gladkov <gladkov.alexey@gmail.com> writes:
> >> 
> >> > This allows to flush dcache entries of a task on multiple procfs mounts
> >> > per pid namespace.
> >> >
> >> > The RCU lock is used because the number of reads at the task exit time
> >> > is much larger than the number of procfs mounts.
> >> 
> >> A couple of quick comments.
> >> 
> >> > Cc: Kees Cook <keescook@chromium.org>
> >> > Cc: Andy Lutomirski <luto@kernel.org>
> >> > Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> >> > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> >> > Signed-off-by: Alexey Gladkov <gladkov.alexey@gmail.com>
> >> > ---
> >> >  fs/proc/base.c                | 20 +++++++++++++++-----
> >> >  fs/proc/root.c                | 27 ++++++++++++++++++++++++++-
> >> >  include/linux/pid_namespace.h |  2 ++
> >> >  include/linux/proc_fs.h       |  2 ++
> >> >  4 files changed, 45 insertions(+), 6 deletions(-)
> >> >
> >> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> >> > index 4ccb280a3e79..24b7c620ded3 100644
> >> > --- a/fs/proc/base.c
> >> > +++ b/fs/proc/base.c
> >> > @@ -3133,7 +3133,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> >> >  	.permission	= proc_pid_permission,
> >> >  };
> >> >  
> >> > -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> >> > +static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
> >> Perhaps just rename things like:
> >> > +static void proc_flush_task_root(struct dentry *root, pid_t pid, pid_t tgid)
> >> >  {
> >> 
> >> I don't think the mnt_ prefix conveys any information, and it certainly
> >> makes everything longer and more cumbersome.
> >> 
> >> >  	struct dentry *dentry, *leader, *dir;
> >> >  	char buf[10 + 1];
> >> > @@ -3142,7 +3142,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> >> >  	name.name = buf;
> >> >  	name.len = snprintf(buf, sizeof(buf), "%u", pid);
> >> >  	/* no ->d_hash() rejects on procfs */
> >> > -	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
> >> > +	dentry = d_hash_and_lookup(mnt_root, &name);
> >> >  	if (dentry) {
> >> >  		d_invalidate(dentry);
> >> >  		dput(dentry);
> >> > @@ -3153,7 +3153,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> >> >  
> >> >  	name.name = buf;
> >> >  	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
> >> > -	leader = d_hash_and_lookup(mnt->mnt_root, &name);
> >> > +	leader = d_hash_and_lookup(mnt_root, &name);
> >> >  	if (!leader)
> >> >  		goto out;
> >> >  
> >> > @@ -3208,14 +3208,24 @@ void proc_flush_task(struct task_struct *task)
> >> >  	int i;
> >> >  	struct pid *pid, *tgid;
> >> >  	struct upid *upid;
> >> > +	struct dentry *mnt_root;
> >> > +	struct proc_fs_info *fs_info;
> >> >  
> >> >  	pid = task_pid(task);
> >> >  	tgid = task_tgid(task);
> >> >  
> >> >  	for (i = 0; i <= pid->level; i++) {
> >> >  		upid = &pid->numbers[i];
> >> > -		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> >> > -					tgid->numbers[i].nr);
> >> > +
> >> > +		rcu_read_lock();
> >> > +		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
> >> > +			mnt_root = fs_info->m_super->s_root;
> >> > +			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
> >> > +		}
> >> > +		rcu_read_unlock();
> >> > +
> >> > +		mnt_root = upid->ns->proc_mnt->mnt_root;
> >> > +		proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
> >> 
> >> I don't think this following of proc_mnt is needed.  It certainly
> >> shouldn't be.  The loop through all of the super blocks should be
> >> enough.
> >
> > Yes, thanks!
> >
> >> Once this change goes through.  UML can be given it's own dedicated
> >> proc_mnt for the initial pid namespace, and proc_mnt can be removed
> >> entirely.
> >
> > After you deleted the old sysctl syscall we could probably do it.
> >
> >> Unless something has changed recently UML is the only other user of
> >> pid_ns->proc_mnt.  That proc_mnt really only exists to make the loop in
> >> proc_flush_task easy to write.
> >
> > Now I think, is there any way to get rid of proc_mounts or even
> > proc_flush_task somehow.
> >
> >> It also probably makes sense to take the rcu_read_lock() over
> >> that entire for loop.
> >
> > Al Viro pointed out to me that I cannot use rcu locks here :(
> 
> Fundamentally proc_flush_task is an optimization.  Just getting rid of
> dentries earlier.  At least at one point it was an important
> optimization because the old process dentries would just sit around
> doing nothing for anyone.
> 
> I wonder if instead of invalidating specific dentries we could instead
> fire wake up a shrinker and point it at one or more instances of proc.
> 
> The practical challenge I see is something might need to access the
> dentries to see that they are invalid.
> 
> We definitely could try without this optimization and see what happens.

When Linus said that a semaphore for proc_mounts is a bad idea, I tried
to come up with some kind of asynchronous way to clear it per superblock.
I gave up with the asynchronous GC because userspace can quite easily get
ahead of it.

Without this optimization the kernel starts to consume a lot of memory
during intensive reading /proc. I tried to do:

while :; do
	for x in `seq 0 9`; do sleep 0.1; done;
	ls /proc/[0-9]*;
done >/dev/null;

and memory consumption went up without proc_flush_task. Since we have
mounted procfs in each container, this is dangerous.
Linus Torvalds Feb. 12, 2020, 6:45 p.m. UTC | #10
On Wed, Feb 12, 2020 at 7:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Fundamentally proc_flush_task is an optimization.  Just getting rid of
> dentries earlier.  At least at one point it was an important
> optimization because the old process dentries would just sit around
> doing nothing for anyone.

I'm pretty sure it's still important. It's very easy to generate a
_ton_ of dentries with /proc.

> I wonder if instead of invalidating specific dentries we could instead
> fire wake up a shrinker and point it at one or more instances of proc.

It shouldn't be the dentries themselves that are a freeing problem.
They're being RCU-free'd anyway because of lookup. It's the
proc_mounts list that is the problem, isn't it?

So it's just fs_info that needs to be rcu-delayed because it contains
that list. Or is there something else?

               Linus
Eric W. Biederman Feb. 12, 2020, 7:16 p.m. UTC | #11
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Wed, Feb 12, 2020 at 7:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Fundamentally proc_flush_task is an optimization.  Just getting rid of
>> dentries earlier.  At least at one point it was an important
>> optimization because the old process dentries would just sit around
>> doing nothing for anyone.
>
> I'm pretty sure it's still important. It's very easy to generate a
> _ton_ of dentries with /proc.
>
>> I wonder if instead of invalidating specific dentries we could instead
>> fire wake up a shrinker and point it at one or more instances of proc.
>
> It shouldn't be the dentries themselves that are a freeing problem.
> They're being RCU-free'd anyway because of lookup. It's the
> proc_mounts list that is the problem, isn't it?
>
> So it's just fs_info that needs to be rcu-delayed because it contains
> that list. Or is there something else?

The fundamental dcache thing we are playing with is:

	dentry = d_hash_and_lookup(proc_root, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

As Al pointed out upthread dput and d_invalidate can both sleep.

The dput can potentially go away if we use __d_lookup_rcu instead of
d_lookup.

The challenge is d_invalidate.

It has the fundamentally sleeping detach_mounts loop.  Even
shrink_dcache_parent has a cond_sched() in there to ensure it doesn't
live lock the system.

We could and arguabley should set DCACHE_CANT_MOUNT on the proc pid
dentries.  Which will prevent having to deal with mounts.

But I don't see an easy way of getting shrink_dcache_parent to run
without sleeping.  Ideas?


Eric
Al Viro Feb. 12, 2020, 7:47 p.m. UTC | #12
On Wed, Feb 12, 2020 at 10:45:06AM -0800, Linus Torvalds wrote:
> On Wed, Feb 12, 2020 at 7:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >
> > Fundamentally proc_flush_task is an optimization.  Just getting rid of
> > dentries earlier.  At least at one point it was an important
> > optimization because the old process dentries would just sit around
> > doing nothing for anyone.
> 
> I'm pretty sure it's still important. It's very easy to generate a
> _ton_ of dentries with /proc.
> 
> > I wonder if instead of invalidating specific dentries we could instead
> > fire wake up a shrinker and point it at one or more instances of proc.
> 
> It shouldn't be the dentries themselves that are a freeing problem.
> They're being RCU-free'd anyway because of lookup. It's the
> proc_mounts list that is the problem, isn't it?
> 
> So it's just fs_info that needs to be rcu-delayed because it contains
> that list. Or is there something else?

Large part of the headache is the possibility that some joker has
done something like mounting tmpfs on /proc/<pid>/map_files, or
binding /dev/null on top of /proc/<pid>/syscall, etc.

IOW, that d_invalidate() can very well have to grab namespace_sem.
And possibly do a full-blown fs shutdown of something NFS-mounted,
etc...
Linus Torvalds Feb. 12, 2020, 7:49 p.m. UTC | #13
On Wed, Feb 12, 2020 at 11:18 AM Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> > So it's just fs_info that needs to be rcu-delayed because it contains
> > that list. Or is there something else?
>
> The fundamental dcache thing we are playing with is:
>
>         dentry = d_hash_and_lookup(proc_root, &name);
>         if (dentry) {
>                 d_invalidate(dentry);
>                 dput(dentry);
>         }

Ahh. And we can't do that part under the RCU read lock. So it's not
the freeing, it's the list traversal itself.

Fair enough.

Hmm.

I wonder if we could split up d_invalidate(). It already ends up being
two phases: first the unhashing under the d_lock, and then the
recursive shrinking of parents and children.

The recursive shrinking of the parent isn't actually interesting for
the proc shrinking case: we just looked up one child, after all. So we
only care about the d_walk of the children.

So if we only did the first part under the RCU lock, and just
collected the dentries (can we perhaps then re-use the hash list to
collect them to another list?) and then did the child d_walk
afterwards?

             Linus
Al Viro Feb. 12, 2020, 8:03 p.m. UTC | #14
On Wed, Feb 12, 2020 at 11:49:58AM -0800, Linus Torvalds wrote:

> I wonder if we could split up d_invalidate(). It already ends up being
> two phases: first the unhashing under the d_lock, and then the
> recursive shrinking of parents and children.
> 
> The recursive shrinking of the parent isn't actually interesting for
> the proc shrinking case: we just looked up one child, after all. So we
> only care about the d_walk of the children.
> 
> So if we only did the first part under the RCU lock, and just
> collected the dentries (can we perhaps then re-use the hash list to
> collect them to another list?) and then did the child d_walk
> afterwards?

What's to prevent racing with fs shutdown while you are doing the second part?
We could, after all, just have them[*] on procfs-private list (anchored in
task_struct) from the very beginning; evict on ->d_prune(), walk the list
on exit...  How do you make sure the fs instance won't go away right under
you while you are doing the real work?  Suppose you are looking at one
of those dentries and you've found something blocking to do.  You can't
pin that dentry; you can pin ->s_active on its superblock (if it's already
zero, you can skip it - fs shutdown already in progress will take care of
the damn thing), but that will lead to quite a bit of cacheline pingpong...

[*] only /proc/<pid> and /proc/*/task/<pid> dentries, obviously.
Linus Torvalds Feb. 12, 2020, 8:35 p.m. UTC | #15
On Wed, Feb 12, 2020 at 12:03 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> What's to prevent racing with fs shutdown while you are doing the second part?

I was thinking that only the proc_flush_task() code would do this.

And that holds a ref to the vfsmount through upid->ns.

So I wasn't suggesting doing this in general - just splitting up the
implementation of d_invalidate() so that proc_flush_task_mnt() could
delay the complex part to after having traversed the RCU-protected
list.

But hey - I missed this part of the problem originally, so maybe I'm
just missing something else this time. Wouldn't be the first time.

               Linus
Al Viro Feb. 12, 2020, 8:38 p.m. UTC | #16
On Wed, Feb 12, 2020 at 12:35:04PM -0800, Linus Torvalds wrote:
> On Wed, Feb 12, 2020 at 12:03 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > What's to prevent racing with fs shutdown while you are doing the second part?
> 
> I was thinking that only the proc_flush_task() code would do this.
> 
> And that holds a ref to the vfsmount through upid->ns.
> 
> So I wasn't suggesting doing this in general - just splitting up the
> implementation of d_invalidate() so that proc_flush_task_mnt() could
> delay the complex part to after having traversed the RCU-protected
> list.
> 
> But hey - I missed this part of the problem originally, so maybe I'm
> just missing something else this time. Wouldn't be the first time.

Wait, I thought the whole point of that had been to allow multiple
procfs instances for the same userns?  Confused...
Al Viro Feb. 12, 2020, 8:41 p.m. UTC | #17
On Wed, Feb 12, 2020 at 08:38:33PM +0000, Al Viro wrote:
> On Wed, Feb 12, 2020 at 12:35:04PM -0800, Linus Torvalds wrote:
> > On Wed, Feb 12, 2020 at 12:03 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > What's to prevent racing with fs shutdown while you are doing the second part?
> > 
> > I was thinking that only the proc_flush_task() code would do this.
> > 
> > And that holds a ref to the vfsmount through upid->ns.
> > 
> > So I wasn't suggesting doing this in general - just splitting up the
> > implementation of d_invalidate() so that proc_flush_task_mnt() could
> > delay the complex part to after having traversed the RCU-protected
> > list.
> > 
> > But hey - I missed this part of the problem originally, so maybe I'm
> > just missing something else this time. Wouldn't be the first time.
> 
> Wait, I thought the whole point of that had been to allow multiple
> procfs instances for the same userns?  Confused...

s/userns/pidns/, sorry
Linus Torvalds Feb. 12, 2020, 9:02 p.m. UTC | #18
On Wed, Feb 12, 2020 at 12:41 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Wed, Feb 12, 2020 at 08:38:33PM +0000, Al Viro wrote:
> >
> > Wait, I thought the whole point of that had been to allow multiple
> > procfs instances for the same userns?  Confused...
>
> s/userns/pidns/, sorry

Right, but we still hold the ref to it here...

[ Looks more ]

Oooh. No we don't. Exactly because we don't hold the lock, only the
rcu lifetime, the ref can go away from under us. I see what your
concern is.

Ouch, this is more painful than I expected - the code flow looked so
simple. I really wanted to avoid a new lock during process shutdown,
because that has always been somewhat painful.

            Linus
Eric W. Biederman Feb. 12, 2020, 9:46 p.m. UTC | #19
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Wed, Feb 12, 2020 at 12:41 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>>
>> On Wed, Feb 12, 2020 at 08:38:33PM +0000, Al Viro wrote:
>> >
>> > Wait, I thought the whole point of that had been to allow multiple
>> > procfs instances for the same userns?  Confused...
>>
>> s/userns/pidns/, sorry
>
> Right, but we still hold the ref to it here...
>
> [ Looks more ]
>
> Oooh. No we don't. Exactly because we don't hold the lock, only the
> rcu lifetime, the ref can go away from under us. I see what your
> concern is.
>
> Ouch, this is more painful than I expected - the code flow looked so
> simple. I really wanted to avoid a new lock during process shutdown,
> because that has always been somewhat painful.

The good news is proc_flush_task isn't exactly called from process exit.
proc_flush_task is called during zombie clean up. AKA release_task.

So proc_flush_task isn't called with any locks held, and it is
called in a context where it can sleep.

Further after proc_flush_task does its thing the code goes
and does "write_lock_irq(&task_list_lock);"

So the code is definitely serialized to one processor already.

What would be downside of having a mutex for a list of proc superblocks?
A mutex that is taken for both reading and writing the list.

Eric
Linus Torvalds Feb. 13, 2020, 12:48 a.m. UTC | #20
On Wed, Feb 12, 2020 at 1:48 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> The good news is proc_flush_task isn't exactly called from process exit.
> proc_flush_task is called during zombie clean up. AKA release_task.

Yeah, that at least avoids some of the nasty locking while dying debug problems.

But the one I was more worried about was actually the lock contention
issue with lots of processes. The lock is basically a single global
lock in many situations - yes, it's technically per-ns, but in a lot
of cases you really only have one namespace anyway.

And we've had problems with global locks in this area before, notably
the one you call out:

> Further after proc_flush_task does it's thing the code goes
> and does "write_lock_irq(&task_list_lock);"

Yeah, so it's not introducing a new issue, but it is potentially
making something we already know is bad even worse.

> What would be downside of having a mutex for a list of proc superblocks?
> A mutex that is taken for both reading and writing the list.

That's what the original patch actually was, and I was hoping we could
avoid that thing.

An rwsem would be possibly better, since most cases by far are likely
about reading.

And yes, I'm very aware of the task_list_lock, but it's literally why
I don't want to make a new one.

I'm _hoping_ we can some day come up with something better than task_list_lock.

            Linus
Eric W. Biederman Feb. 13, 2020, 4:37 a.m. UTC | #21
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Wed, Feb 12, 2020 at 1:48 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> The good news is proc_flush_task isn't exactly called from process exit.
>> proc_flush_task is called during zombie clean up. AKA release_task.
>
> Yeah, that at least avoids some of the nasty locking while dying debug problems.
>
> But the one I was more worried about was actually the lock contention
> issue with lots of processes. The lock is basically a single global
> lock in many situations - yes, it's technically per-ns, but in a lot
> of cases you really only have one namespace anyway.
>
> And we've had problems with global locks in this area before, notably
> the one you call out:
>
>> Further after proc_flush_task does it's thing the code goes
>> and does "write_lock_irq(&task_list_lock);"
>
> Yeah, so it's not introducing a new issue, but it is potentially
> making something we already know is bad even worse.
>
>> What would be downside of having a mutex for a list of proc superblocks?
>> A mutex that is taken for both reading and writing the list.
>
> That's what the original patch actually was, and I was hoping we could
> avoid that thing.
>
> An rwsem would be possibly better, since most cases by far are likely
> about reading.
>
> And yes, I'm very aware of the task_list_lock, but it's literally why
> I don't want to make a new one.
>
> I'm _hoping_ we can some day come up with something better than
> task_list_lock.

Yes.  I understand that.

I occasionally play with ideas, and converted all of proc to rcu
to help with the situation but I haven't come up with anything clearly
better.


All of this is why I was really hoping we could have a change in
strategy and see if we can make the shrinker be able to better prune
proc inodes.



I think I have an alternate idea that could work.  Add some extra code
into proc_task_readdir, that would look for dentries that no longer
point to tasks and d_invalidate them.  With the same logic probably
being called from a few more places as well like proc_pid_readdir,
proc_task_lookup, and proc_pid_lookup.

We could even optimize it and have a process died flag we set in the
superblock.

That would batch up the freeing work until the next time someone
reads from proc in a way that would create more dentries.  So it would
prevent dentries from reaped zombies from growing without bound.

Hmm.  Given the existence of proc_fill_cache it would really be a good
idea if readdir and lookup performed some of the freeing work as well.
As on readdir we always populate the dcache for all of the directory
entries.

I am booked solid for the next little while but if no one beats me to it
I will try and code something like that up where at least readdir
looks for and invalidates stale dentries.

Eric
Al Viro Feb. 13, 2020, 5:55 a.m. UTC | #22
On Wed, Feb 12, 2020 at 10:37:52PM -0600, Eric W. Biederman wrote:

> I think I have an alternate idea that could work.  Add some extra code
> into proc_task_readdir, that would look for dentries that no longer
> point to tasks and d_invalidate them.  With the same logic probably
> being called from a few more places as well like proc_pid_readdir,
> proc_task_lookup, and proc_pid_lookup.
> 
> We could even optimize it and have a process died flag we set in the
> superblock.
> 
> That would would batch up the freeing work until the next time someone
> reads from proc in a way that would create more dentries.  So it would
> prevent dentries from reaped zombies from growing without bound.
> 
> Hmm.  Given the existence of proc_fill_cache it would really be a good
> idea if readdir and lookup performed some of the freeing work as well.
> As on readdir we always populate the dcache for all of the directory
> entries.

First of all, that won't do a damn thing when nobody is accessing
given superblock.  What's more, readdir in root of that procfs instance
is not enough - you need it in task/ of group leader.

What I don't understand is the insistence on getting those dentries
via dcache lookups.  _IF_ we are willing to live with cacheline
contention (on ->d_lock of root dentry, if nothing else), why not
do the following:
	* put all dentries of such directories ([0-9]* and [0-9]*/task/*)
into a list anchored in task_struct; have non-counting reference to
task_struct stored in them (might simplify part of get_proc_task() users,
BTW - avoids pid-to-task_struct lookups if we have a dentry and not just
the inode; many callers do)
	* have ->d_release() remove from it (protecting per-task_struct lock
nested outside of all ->d_lock)
	* on exit:
	lock the (per-task_struct) list
	while list is non-empty
		pick the first dentry
		remove from the list
		sb = dentry->d_sb
		try to bump sb->s_active (if non-zero, that is).
		if failed
			continue // move on to the next one - nothing to do here
		grab ->d_lock
		res = handle_it(dentry, &temp_list)
		drop ->d_lock
		unlock the list
		if (!list_empty(&temp_list))
			shrink_dentry_list(&temp_list)
		if (res)
			d_invalidate(dentry)
			dput(dentry)
		deactivate_super(sb)
		lock the list
	unlock the list

handle_it(dentry, temp_list) // ->d_lock held; that one should be in dcache.c
	if ->d_count is negative // unlikely
		return 0;
	if ->d_count is positive,
		increment ->d_count
		return 1;
	// OK, it's still alive, but ->d_count is 0
	__d_drop	// equivalent of d_invalidate in this case
	if not on a shrink list // otherwise it's not our headache
		if on lru list
			d_lru_del
		d_shrink_add dentry to temp_list
	return 0;

And yeah, that'll dirty ->s_active for each procfs superblock that
has dentry for our process present in dcache.  On exit()...
Linus Torvalds Feb. 13, 2020, 9:30 p.m. UTC | #23
On Wed, Feb 12, 2020 at 9:55 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> What I don't understand is the insistence on getting those dentries
> via dcache lookups.

I don't think that's an "insistence", it's more of a "historical
behavior" together with "several changes over the years to deal with
dentry-level cleanups and updates".

> _IF_ we are willing to live with cacheline
> contention (on ->d_lock of root dentry, if nothing else), why not
> do the following:
>         * put all dentries of such directories ([0-9]* and [0-9]*/task/*)
> into a list anchored in task_struct; have non-counting reference to
> task_struct stored in them (might simplify part of get_proc_task() users,

Hmm.

Right now I don't think we actually create any dentries at all for the
short-lived process case.

Wouldn't your suggestion make fork/exit rather worse?

Or would you create the dentries dynamically still at lookup time, and
then attach them to the process at that point?

What list would you use for the dentry chaining? Would you play games
with the dentry hashing, and "hash" them off the process, and never
hit in the lookup cache?

Am I misunderstanding what you suggest?

            Linus
Al Viro Feb. 13, 2020, 10:23 p.m. UTC | #24
On Thu, Feb 13, 2020 at 01:30:11PM -0800, Linus Torvalds wrote:
> On Wed, Feb 12, 2020 at 9:55 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > What I don't understand is the insistence on getting those dentries
> > via dcache lookups.
> 
> I don't think that's an "insistence", it's more of a "historical
> behavior" together with "several changes over the years to deal with
> dentry-level cleanups and updates".
> 
> > _IF_ we are willing to live with cacheline
> > contention (on ->d_lock of root dentry, if nothing else), why not
> > do the following:
> >         * put all dentries of such directories ([0-9]* and [0-9]*/task/*)
> > into a list anchored in task_struct; have non-counting reference to
> > task_struct stored in them (might simplify part of get_proc_task() users,
> 
> Hmm.
> 
> Right now I don't think we actually create any dentries at all for the
> short-lived process case.
> 
> Wouldn't your suggestion make fork/exit rather worse?
> 
> Or would you create the dentries dynamically still at lookup time, and
> then attach them to the process at that point?
> 
> What list would you use for the dentry chaining? Would you play games
> with the dentry hashing, and "hash" them off the process, and never
> hit in the lookup cache?

I'd been thinking of ->d_fsdata pointing to a structure with list_head
and a (non-counting) task_struct pointer for those guys.  Allocated
on lookup, of course (as well as readdir ;-/) and put on the list
at the same time.

IOW, for short-lived process we simply have an empty (h)list anchored
in task_struct and that's it.
Linus Torvalds Feb. 13, 2020, 10:47 p.m. UTC | #25
On Thu, Feb 13, 2020 at 2:23 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> I'd been thinking of ->d_fsdata pointing to a structure with list_head
> and a (non-counting) task_struct pointer for those guys.  Allocated
> on lookup, of course (as well as readdir ;-/) and put on the list
> at the same time.

Hmm. That smells like potentially a lot of small allocations, and
making readdir() even nastier.

Do we really want to create the dentries at readdir time? We do now
(with proc_fill_cache()) but do we actually _need_ to?

I guess a lot of readdir users end up doing a stat on it immediately
afterwards. I think right now we do it to get the inode number, and
maybe that is a basic requirement (even if I don't think it's really
stable - an inode could be evicted and then the ino changes, no?)

Ho humm. This all doesn't make me happy. But I guess the proof is in
the pudding - and if you come up with a good patch, I won't complain.

              Linus
Eric W. Biederman Feb. 14, 2020, 3:48 a.m. UTC | #26
Al Viro <viro@zeniv.linux.org.uk> writes:

> On Wed, Feb 12, 2020 at 10:37:52PM -0600, Eric W. Biederman wrote:
>
>> I think I have an alternate idea that could work.  Add some extra code
>> into proc_task_readdir, that would look for dentries that no longer
>> point to tasks and d_invalidate them.  With the same logic probably
>> being called from a few more places as well like proc_pid_readdir,
>> proc_task_lookup, and proc_pid_lookup.
>> 
>> We could even optimize it and have a process died flag we set in the
>> superblock.
>> 
>> That would would batch up the freeing work until the next time someone
>> reads from proc in a way that would create more dentries.  So it would
>> prevent dentries from reaped zombies from growing without bound.
>> 
>> Hmm.  Given the existence of proc_fill_cache it would really be a good
>> idea if readdir and lookup performed some of the freeing work as well.
>> As on readdir we always populate the dcache for all of the directory
>> entries.
>
> First of all, that won't do a damn thing when nobody is accessing
> given superblock.  What's more, readdir in root of that procfs instance
> is not enough - you need it in task/ of group leader.

It should give a rough bound on the number of stale dentries a
superblock can have.  The same basic concept has been used very
successfully in many incremental garbage collectors.  In those malloc
(or the equivalent) does a finite amount of garbage collection work to
roughly balance out the amount of memory allocated.  I am proposing
something similar for proc instances.

Further if no one is accessing a superblock we don't have a problem
either.


> What I don't understand is the insistence on getting those dentries
> via dcache lookups.  _IF_ we are willing to live with cacheline
> contention (on ->d_lock of root dentry, if nothing else), why not
> do the following:

No insistence from this side.

I was not seeing atomic_inc_not_zero(sb->s_active) from rcu
context as an option earlier.  But it is an option.

> 	* put all dentries of such directories ([0-9]* and [0-9]*/task/*)
> into a list anchored in task_struct; have non-counting reference to
> task_struct stored in them (might simplify part of get_proc_task() users,
> BTW - avoids pid-to-task_struct lookups if we have a dentry and not just
> the inode; many callers do)
> 	* have ->d_release() remove from it (protecting per-task_struct lock
> nested outside of all ->d_lock)
> 	* on exit:
> 	lock the (per-task_struct) list
> 	while list is non-empty
> 		pick the first dentry
> 		remove from the list
> 		sb = dentry->d_sb
> 		try to bump sb->s_active (if non-zero, that is).
> 		if failed
> 			continue // move on to the next one - nothing to do here
> 		grab ->d_lock
> 		res = handle_it(dentry, &temp_list)
> 		drop ->d_lock
> 		unlock the list
> 		if (!list_empty(&temp_list))
> 			shrink_dentry_list(&temp_list)
> 		if (res)
> 			d_invalidate(dentry)
> 			dput(dentry)
> 		deactivate_super(sb)
> 		lock the list
> 	unlock the list
>
> handle_it(dentry, temp_list) // ->d_lock held; that one should be in dcache.c
> 	if ->d_count is negative // unlikely
> 		return 0;
> 	if ->d_count is positive,
> 		increment ->d_count
> 		return 1;
> 	// OK, it's still alive, but ->d_count is 0
> 	__d_drop	// equivalent of d_invalidate in this case
> 	if not on a shrink list // otherwise it's not our headache
> 		if on lru list
> 			d_lru_del
> 		d_shrink_add dentry to temp_list
> 	return 0;
>
> And yeah, that'll dirty ->s_active for each procfs superblock that
> has dentry for our process present in dcache.  On exit()...


I would thread the whole thing through the proc_inode instead of coming
up with a new allocation per dentry so an extra memory allocation isn't
needed.  We already have i_dentry.  So going from the vfs_inode to
the dentry is trivial.



But truthfully I don't like proc_flush_task.

The problem is that proc_flush_task is a layering violation and magic
code that pretty much no one understands.  We have some very weird
cases where dput or d_invalidate wound up triggering ext3 code.  It has
been fixed for a long time now, but it was crazy weird unexpected
stuff.


Al your logic above just feels very clever, and like many pieces of the
kernel have to know how other pieces of the kernel work.  If we can find
something stupid and simple that also solves the problem I would be much
happier.   Then anyone could understand and fix it if something goes
wrong.

Eric
Eric W. Biederman Feb. 14, 2020, 3:49 a.m. UTC | #27
Al Viro <viro@zeniv.linux.org.uk> writes:

> On Wed, Feb 12, 2020 at 12:35:04PM -0800, Linus Torvalds wrote:
>> On Wed, Feb 12, 2020 at 12:03 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>> >
>> > What's to prevent racing with fs shutdown while you are doing the second part?
>> 
>> I was thinking that only the proc_flush_task() code would do this.
>> 
>> And that holds a ref to the vfsmount through upid->ns.
>> 
>> So I wasn't suggesting doing this in general - just splitting up the
>> implementation of d_invalidate() so that proc_flush_task_mnt() could
>> delay the complex part to after having traversed the RCU-protected
>> list.
>> 
>> But hey - I missed this part of the problem originally, so maybe I'm
>> just missing something else this time. Wouldn't be the first time.
>
> Wait, I thought the whole point of that had been to allow multiple
> procfs instances for the same userns?  Confused...

Multiple procfs instances for the same pidns.  Exactly.

Which would let people have their own set of procfs mount
options without having to worry about stomping on someone else.

The fundamental problem with multiple procfs instances per pidns
is there isn't an obvious place to put a vfs mount.


...


Which means we need some way to keep the file system from going away
while anyone in the kernel is running proc_flush_task.

One was I can see to solve this that would give us cheap readers, is to
have a percpu count of the number of processes in proc_flush_task.
That would work something like mnt_count.

Then forbid proc_kill_sb from removing any super block from the list
or otherwise making progress until the proc_flush_task_count goes
to zero.


If we wanted cheap readers and an expensive writer
kind of flag that proc_kill_sb can

Thinking out loud perhaps we have add a list_head on task_struct
and a list_head in proc_inode.  That would let us find the inodes
and by extension the dentries we care about quickly.

Then in evict_inode we could remove the proc_inode from the list.


Eric
Eric W. Biederman Feb. 14, 2020, 2:15 p.m. UTC | #28
Linus Torvalds <torvalds@linux-foundation.org> writes:

> I guess a lot of readdir users end up doing a stat on it immediately
> afterwards. I think right now we do it to get the inode number, and
> maybe that is a basic requirement (even if I don't think it's really
> stable - an inode could be evicted and then the ino changes, no?)

All I know is proc_fill_cache seemed like a good idea at the time.
I may have been too clever.

While I think proc_fill_cache probably exacerbates the issue
it isn't the reason we have the flushing logic.  The proc
flushing logic was introduced in around 2.5.9 much earlier
than the other proc things.

commit 0030633355db2bba32d97655df73b04215018ab9
Author: Alexander Viro <viro@math.psu.edu>
Date:   Sun Apr 21 23:03:37 2002 -0700

    [PATCH] (3/5) sane procfs/dcache interaction
    
     - sane dentry retention.  Namely, we don't kill /proc/<pid> dentries at the
       first opportunity (as the current tree does).  Instead we do the following:
            * ->d_delete() kills it only if process is already dead.
            * all ->lookup() in proc/base.c end with checking if process is still
              alive and unhash if it isn't.
            * proc_pid_lookup() (lookup for /proc/<pid>) caches reference to dentry
              in task_struct.  It's _not_ counted in ->d_count.
            * ->d_iput() resets said reference to NULL.
            * release_task() (burying a zombie) checks if there is a cached
              reference and if there is - shrinks the subtree.
            * tasklist_lock is used for exclusion.
       That way we are guaranteed that after release_task() all dentries in
       /proc/<pid> will go away as soon as possible; OTOH, before release_task()
       we have normal retention policy - they go away under memory pressure with
       the same rules as for dentries on any other fs.

Tracking down when this logic was introduced I also see that this code
has broken again and again any time proc changes (like now).  So it is
definitely subtle and fragile.

Eric
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ccb280a3e79..24b7c620ded3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3133,7 +3133,7 @@  static const struct inode_operations proc_tgid_base_inode_operations = {
 	.permission	= proc_pid_permission,
 };
 
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
+static void proc_flush_task_mnt_root(struct dentry *mnt_root, pid_t pid, pid_t tgid)
 {
 	struct dentry *dentry, *leader, *dir;
 	char buf[10 + 1];
@@ -3142,7 +3142,7 @@  static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 	name.name = buf;
 	name.len = snprintf(buf, sizeof(buf), "%u", pid);
 	/* no ->d_hash() rejects on procfs */
-	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
+	dentry = d_hash_and_lookup(mnt_root, &name);
 	if (dentry) {
 		d_invalidate(dentry);
 		dput(dentry);
@@ -3153,7 +3153,7 @@  static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 
 	name.name = buf;
 	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
-	leader = d_hash_and_lookup(mnt->mnt_root, &name);
+	leader = d_hash_and_lookup(mnt_root, &name);
 	if (!leader)
 		goto out;
 
@@ -3208,14 +3208,24 @@  void proc_flush_task(struct task_struct *task)
 	int i;
 	struct pid *pid, *tgid;
 	struct upid *upid;
+	struct dentry *mnt_root;
+	struct proc_fs_info *fs_info;
 
 	pid = task_pid(task);
 	tgid = task_tgid(task);
 
 	for (i = 0; i <= pid->level; i++) {
 		upid = &pid->numbers[i];
-		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-					tgid->numbers[i].nr);
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(fs_info, &upid->ns->proc_mounts, pidns_entry) {
+			mnt_root = fs_info->m_super->s_root;
+			proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
+		}
+		rcu_read_unlock();
+
+		mnt_root = upid->ns->proc_mnt->mnt_root;
+		proc_flush_task_mnt_root(mnt_root, upid->nr, tgid->numbers[i].nr);
 	}
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5d5cba4c899b..e2bb015da1a8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,7 +149,22 @@  static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 	if (ret) {
 		return ret;
 	}
-	return proc_setup_thread_self(s);
+
+	ret = proc_setup_thread_self(s);
+	if (ret) {
+		return ret;
+	}
+
+	/*
+	 * back reference to flush dcache entries at process exit time.
+	 */
+	ctx->fs_info->m_super = s;
+
+	spin_lock(&pid_ns->proc_mounts_lock);
+	list_add_tail_rcu(&ctx->fs_info->pidns_entry, &pid_ns->proc_mounts);
+	spin_unlock(&pid_ns->proc_mounts_lock);
+
+	return 0;
 }
 
 static int proc_reconfigure(struct fs_context *fc)
@@ -211,10 +226,17 @@  static void proc_kill_sb(struct super_block *sb)
 {
 	struct proc_fs_info *fs_info = proc_sb_info(sb);
 
+	spin_lock(&fs_info->pid_ns->proc_mounts_lock);
+	list_del_rcu(&fs_info->pidns_entry);
+	spin_unlock(&fs_info->pid_ns->proc_mounts_lock);
+
+	synchronize_rcu();
+
 	if (fs_info->proc_self)
 		dput(fs_info->proc_self);
 	if (fs_info->proc_thread_self)
 		dput(fs_info->proc_thread_self);
+
 	kill_anon_super(sb);
 	put_pid_ns(fs_info->pid_ns);
 	kfree(fs_info);
@@ -336,6 +358,9 @@  int pid_ns_prepare_proc(struct pid_namespace *ns)
 		ctx->fs_info->pid_ns = ns;
 	}
 
+	spin_lock_init(&ns->proc_mounts_lock);
+	INIT_LIST_HEAD_RCU(&ns->proc_mounts);
+
 	mnt = fc_mount(fc);
 	put_fs_context(fc);
 	if (IS_ERR(mnt))
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 66f47f1afe0d..c36af1dfd862 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -26,6 +26,8 @@  struct pid_namespace {
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt; /* Internal proc mounted during each new pidns */
+	spinlock_t proc_mounts_lock;
+	struct list_head proc_mounts; /* list of separated procfs mounts */
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct fs_pin *bacct;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 5f0b1b7e4271..f307940f8311 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -20,6 +20,8 @@  enum {
 };
 
 struct proc_fs_info {
+	struct list_head pidns_entry;    /* Node in procfs_mounts of a pidns */
+	struct super_block *m_super;
 	struct pid_namespace *pid_ns;
 	struct dentry *proc_self;        /* For /proc/self */
 	struct dentry *proc_thread_self; /* For /proc/thread-self */