diff mbox series

[v2,5/6] proc: use down_read_killable mmap_sem for /proc/pid/map_files

Message ID 156007493995.3335.9595044802115356911.stgit@buzz (mailing list archive)
State New, archived
Headers show
Series mm: use down_read_killable for locking mmap_sem in proc | expand

Commit Message

Konstantin Khlebnikov June 9, 2019, 10:09 a.m. UTC
Do not stuck forever if something wrong.
Killable lock allows to cleanup stuck tasks and simplifies investigation.

It seems ->d_revalidate() could return any error (except ECHILD) to
abort validation and pass error as result of lookup sequence.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 fs/proc/base.c |   27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

Comments

Andrei Vagin June 12, 2019, 11:14 p.m. UTC | #1
On Sun, Jun 09, 2019 at 01:09:00PM +0300, Konstantin Khlebnikov wrote:
> Do not stuck forever if something wrong.
> Killable lock allows to cleanup stuck tasks and simplifies investigation.

This patch breaks the CRIU project, because stat() returns EINTR instead
of ENOENT:

[root@fc24 criu]# stat /proc/self/map_files/0-0
stat: cannot stat '/proc/self/map_files/0-0': Interrupted system call

Here is one inline comment with the fix for this issue.

> 
> It seems ->d_revalidate() could return any error (except ECHILD) to
> abort validation and pass error as result of lookup sequence.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Reviewed-by: Roman Gushchin <guro@fb.com>
> Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
> Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>

It was nice to see all four of you in one place :).

> Acked-by: Michal Hocko <mhocko@suse.com>
> ---
>  fs/proc/base.c |   27 +++++++++++++++++++++------
>  1 file changed, 21 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 9c8ca6cd3ce4..515ab29c2adf 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1962,9 +1962,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
>  		goto out;
>  
>  	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
> -		down_read(&mm->mmap_sem);
> -		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
> -		up_read(&mm->mmap_sem);
> +		status = down_read_killable(&mm->mmap_sem);
> +		if (!status) {
> +			exact_vma_exists = !!find_exact_vma(mm, vm_start,
> +							    vm_end);
> +			up_read(&mm->mmap_sem);
> +		}
>  	}
>  
>  	mmput(mm);
> @@ -2010,8 +2013,11 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
>  	if (rc)
>  		goto out_mmput;
>  
> +	rc = down_read_killable(&mm->mmap_sem);
> +	if (rc)
> +		goto out_mmput;
> +
>  	rc = -ENOENT;
> -	down_read(&mm->mmap_sem);
>  	vma = find_exact_vma(mm, vm_start, vm_end);
>  	if (vma && vma->vm_file) {
>  		*path = vma->vm_file->f_path;
> @@ -2107,7 +2113,10 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
>  	if (!mm)
>  		goto out_put_task;
>  
> -	down_read(&mm->mmap_sem);
> +	result = ERR_PTR(-EINTR);
> +	if (down_read_killable(&mm->mmap_sem))
> +		goto out_put_mm;
> +

	result = ERR_PTR(-ENOENT);

>  	vma = find_exact_vma(mm, vm_start, vm_end);
>  	if (!vma)
>  		goto out_no_vma;
> @@ -2118,6 +2127,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
>  
>  out_no_vma:
>  	up_read(&mm->mmap_sem);
> +out_put_mm:
>  	mmput(mm);
>  out_put_task:
>  	put_task_struct(task);
> @@ -2160,7 +2170,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
>  	mm = get_task_mm(task);
>  	if (!mm)
>  		goto out_put_task;
> -	down_read(&mm->mmap_sem);
> +
> +	ret = down_read_killable(&mm->mmap_sem);
> +	if (ret) {
> +		mmput(mm);
> +		goto out_put_task;
> +	}
>  
>  	nr_files = 0;
>
Andrew Morton June 13, 2019, 12:04 a.m. UTC | #2
On Wed, 12 Jun 2019 16:14:28 -0700 Andrei Vagin <avagin@gmail.com> wrote:

> On Sun, Jun 09, 2019 at 01:09:00PM +0300, Konstantin Khlebnikov wrote:
> > Do not stuck forever if something wrong.
> > Killable lock allows to cleanup stuck tasks and simplifies investigation.
> 
> This patch breaks the CRIU project, because stat() returns EINTR instead
> of ENOENT:
> 
> [root@fc24 criu]# stat /proc/self/map_files/0-0
> stat: cannot stat '/proc/self/map_files/0-0': Interrupted system call
> 
> Here is one inline comment with the fix for this issue.
> 
> > @@ -2107,7 +2113,10 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
> >  	if (!mm)
> >  		goto out_put_task;
> >  
> > -	down_read(&mm->mmap_sem);
> > +	result = ERR_PTR(-EINTR);
> > +	if (down_read_killable(&mm->mmap_sem))
> > +		goto out_put_mm;
> > +
> 
> 	result = ERR_PTR(-ENOENT);
> 

yes, thanks.

--- a/fs/proc/base.c~proc-use-down_read_killable-mmap_sem-for-proc-pid-map_files-fix
+++ a/fs/proc/base.c
@@ -2117,6 +2117,7 @@ static struct dentry *proc_map_files_loo
 	if (down_read_killable(&mm->mmap_sem))
 		goto out_put_mm;
 
+	result = ERR_PTR(-ENOENT);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (!vma)
 		goto out_no_vma;



We started doing this trick of using

	ret = -EFOO;
	if (something)
		goto out;

decades ago because it generated slightly better code.  I rather doubt
whether that's still the case.

In fact this:

--- a/fs/proc/base.c~a
+++ a/fs/proc/base.c
@@ -2096,35 +2096,45 @@ static struct dentry *proc_map_files_loo
 	struct dentry *result;
 	struct mm_struct *mm;
 
-	result = ERR_PTR(-ENOENT);
 	task = get_proc_task(dir);
-	if (!task)
+	if (!task) {
+		result = ERR_PTR(-ENOENT);
 		goto out;
+	}
 
-	result = ERR_PTR(-EACCES);
-	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+		result = ERR_PTR(-EACCES);
 		goto out_put_task;
+	}
 
-	result = ERR_PTR(-ENOENT);
-	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		result = ERR_PTR(-ENOENT);
 		goto out_put_task;
+	}
 
 	mm = get_task_mm(task);
-	if (!mm)
+	if (!mm) {
+		result = ERR_PTR(-ENOENT);
 		goto out_put_task;
+	}
 
-	result = ERR_PTR(-EINTR);
-	if (down_read_killable(&mm->mmap_sem))
+	if (down_read_killable(&mm->mmap_sem)) {
+		result = ERR_PTR(-EINTR);
 		goto out_put_mm;
+	}
 
-	result = ERR_PTR(-ENOENT);
 	vma = find_exact_vma(mm, vm_start, vm_end);
-	if (!vma)
+	if (!vma) {
+		result = ERR_PTR(-ENOENT);
 		goto out_no_vma;
+	}
 
-	if (vma->vm_file)
+	if (vma->vm_file) {
 		result = proc_map_files_instantiate(dentry, task,
 				(void *)(unsigned long)vma->vm_file->f_mode);
+	} else {
+		result = ERR_PTR(-ENOENT);
+	}
 
 out_no_vma:
 	up_read(&mm->mmap_sem);

makes no change to the generated assembly with gcc-7.2.0.

And I do think that the above style is clearer and more reliable, as
this bug demonstrates.
Cyrill Gorcunov June 13, 2019, 6:48 a.m. UTC | #3
On Wed, Jun 12, 2019 at 04:14:28PM -0700, Andrei Vagin wrote:
> On Sun, Jun 09, 2019 at 01:09:00PM +0300, Konstantin Khlebnikov wrote:
> > Do not stuck forever if something wrong.
> > Killable lock allows to cleanup stuck tasks and simplifies investigation.
> 
> This patch breaks the CRIU project, because stat() returns EINTR instead
> of ENOENT:
> 
> [root@fc24 criu]# stat /proc/self/map_files/0-0
> stat: cannot stat '/proc/self/map_files/0-0': Interrupted system call
> 
> Here is one inline comment with the fix for this issue.
> 
> > 
> > It seems ->d_revalidate() could return any error (except ECHILD) to
> > abort validation and pass error as result of lookup sequence.
> > 
> > Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> > Reviewed-by: Roman Gushchin <guro@fb.com>
> > Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
> > Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> 
> It was nice to see all four of you in one place :).

Holymoly ;) And we all managed to miss this error code.
Thanks, Andrew!
Konstantin Khlebnikov June 13, 2019, 8:15 a.m. UTC | #4
On 13.06.2019 2:14, Andrei Vagin wrote:
> On Sun, Jun 09, 2019 at 01:09:00PM +0300, Konstantin Khlebnikov wrote:
>> Do not stuck forever if something wrong.
>> Killable lock allows to cleanup stuck tasks and simplifies investigation.
> 
> This patch breaks the CRIU project, because stat() returns EINTR instead
> of ENOENT:
> 
> [root@fc24 criu]# stat /proc/self/map_files/0-0
> stat: cannot stat '/proc/self/map_files/0-0': Interrupted system call

Good catch.

It seems CRIU tests has good coverage for darkest corners of kernel API.
Kernel CI projects should use it. I suppose you know how to promote this. =)

> 
> Here is one inline comment with the fix for this issue.
> 
>>
>> It seems ->d_revalidate() could return any error (except ECHILD) to
>> abort validation and pass error as result of lookup sequence.
>>
>> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
>> Reviewed-by: Roman Gushchin <guro@fb.com>
>> Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
>> Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> 
> It was nice to see all four of you in one place :).
> 
>> Acked-by: Michal Hocko <mhocko@suse.com>
>> ---
>>   fs/proc/base.c |   27 +++++++++++++++++++++------
>>   1 file changed, 21 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 9c8ca6cd3ce4..515ab29c2adf 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -1962,9 +1962,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
>>   		goto out;
>>   
>>   	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
>> -		down_read(&mm->mmap_sem);
>> -		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
>> -		up_read(&mm->mmap_sem);
>> +		status = down_read_killable(&mm->mmap_sem);
>> +		if (!status) {
>> +			exact_vma_exists = !!find_exact_vma(mm, vm_start,
>> +							    vm_end);
>> +			up_read(&mm->mmap_sem);
>> +		}
>>   	}
>>   
>>   	mmput(mm);
>> @@ -2010,8 +2013,11 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
>>   	if (rc)
>>   		goto out_mmput;
>>   
>> +	rc = down_read_killable(&mm->mmap_sem);
>> +	if (rc)
>> +		goto out_mmput;
>> +
>>   	rc = -ENOENT;
>> -	down_read(&mm->mmap_sem);
>>   	vma = find_exact_vma(mm, vm_start, vm_end);
>>   	if (vma && vma->vm_file) {
>>   		*path = vma->vm_file->f_path;
>> @@ -2107,7 +2113,10 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
>>   	if (!mm)
>>   		goto out_put_task;
>>   
>> -	down_read(&mm->mmap_sem);
>> +	result = ERR_PTR(-EINTR);
>> +	if (down_read_killable(&mm->mmap_sem))
>> +		goto out_put_mm;
>> +
> 
> 	result = ERR_PTR(-ENOENT);
> 
>>   	vma = find_exact_vma(mm, vm_start, vm_end);
>>   	if (!vma)
>>   		goto out_no_vma;
>> @@ -2118,6 +2127,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
>>   
>>   out_no_vma:
>>   	up_read(&mm->mmap_sem);
>> +out_put_mm:
>>   	mmput(mm);
>>   out_put_task:
>>   	put_task_struct(task);
>> @@ -2160,7 +2170,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
>>   	mm = get_task_mm(task);
>>   	if (!mm)
>>   		goto out_put_task;
>> -	down_read(&mm->mmap_sem);
>> +
>> +	ret = down_read_killable(&mm->mmap_sem);
>> +	if (ret) {
>> +		mmput(mm);
>> +		goto out_put_task;
>> +	}
>>   
>>   	nr_files = 0;
>>
Andrei Vagin June 13, 2019, 8:32 p.m. UTC | #5
On Thu, Jun 13, 2019 at 1:15 AM Konstantin Khlebnikov
<khlebnikov@yandex-team.ru> wrote:
>
> On 13.06.2019 2:14, Andrei Vagin wrote:
> > On Sun, Jun 09, 2019 at 01:09:00PM +0300, Konstantin Khlebnikov wrote:
> >> Do not stuck forever if something wrong.
> >> Killable lock allows to cleanup stuck tasks and simplifies investigation.
> >
> > This patch breaks the CRIU project, because stat() returns EINTR instead
> > of ENOENT:
> >
> > [root@fc24 criu]# stat /proc/self/map_files/0-0
> > stat: cannot stat '/proc/self/map_files/0-0': Interrupted system call
>
> Good catch.
>
> It seems CRIU tests has good coverage for darkest corners of kernel API.
> Kernel CI projects should use it. I suppose you know how to promote this. =)

I remember Mike was trying to add the CRIU test suite into kernel-ci,
but it looks like this ended up with nothing.

The good thing here is that we have our own kernel-ci:
https://travis-ci.org/avagin/linux/builds

Travis-CI doesn't allow to replace the kernel, so we use CRIU to
dump/restore a ssh session and travis doesn't notice when we kexec a
new kernel.

>
> >
> > Here is one inline comment with the fix for this issue.
> >
> >>
> >> It seems ->d_revalidate() could return any error (except ECHILD) to
> >> abort validation and pass error as result of lookup sequence.
> >>
> >> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> >> Reviewed-by: Roman Gushchin <guro@fb.com>
> >> Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
> >> Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> >
> > It was nice to see all four of you in one place :).
> >
> >> Acked-by: Michal Hocko <mhocko@suse.com>
> >> ---
> >>   fs/proc/base.c |   27 +++++++++++++++++++++------
> >>   1 file changed, 21 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/fs/proc/base.c b/fs/proc/base.c
> >> index 9c8ca6cd3ce4..515ab29c2adf 100644
> >> --- a/fs/proc/base.c
> >> +++ b/fs/proc/base.c
> >> @@ -1962,9 +1962,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
> >>              goto out;
> >>
> >>      if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
> >> -            down_read(&mm->mmap_sem);
> >> -            exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
> >> -            up_read(&mm->mmap_sem);
> >> +            status = down_read_killable(&mm->mmap_sem);
> >> +            if (!status) {
> >> +                    exact_vma_exists = !!find_exact_vma(mm, vm_start,
> >> +                                                        vm_end);
> >> +                    up_read(&mm->mmap_sem);
> >> +            }
> >>      }
> >>
> >>      mmput(mm);
> >> @@ -2010,8 +2013,11 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
> >>      if (rc)
> >>              goto out_mmput;
> >>
> >> +    rc = down_read_killable(&mm->mmap_sem);
> >> +    if (rc)
> >> +            goto out_mmput;
> >> +
> >>      rc = -ENOENT;
> >> -    down_read(&mm->mmap_sem);
> >>      vma = find_exact_vma(mm, vm_start, vm_end);
> >>      if (vma && vma->vm_file) {
> >>              *path = vma->vm_file->f_path;
> >> @@ -2107,7 +2113,10 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
> >>      if (!mm)
> >>              goto out_put_task;
> >>
> >> -    down_read(&mm->mmap_sem);
> >> +    result = ERR_PTR(-EINTR);
> >> +    if (down_read_killable(&mm->mmap_sem))
> >> +            goto out_put_mm;
> >> +
> >
> >       result = ERR_PTR(-ENOENT);
> >
> >>      vma = find_exact_vma(mm, vm_start, vm_end);
> >>      if (!vma)
> >>              goto out_no_vma;
> >> @@ -2118,6 +2127,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
> >>
> >>   out_no_vma:
> >>      up_read(&mm->mmap_sem);
> >> +out_put_mm:
> >>      mmput(mm);
> >>   out_put_task:
> >>      put_task_struct(task);
> >> @@ -2160,7 +2170,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
> >>      mm = get_task_mm(task);
> >>      if (!mm)
> >>              goto out_put_task;
> >> -    down_read(&mm->mmap_sem);
> >> +
> >> +    ret = down_read_killable(&mm->mmap_sem);
> >> +    if (ret) {
> >> +            mmput(mm);
> >> +            goto out_put_task;
> >> +    }
> >>
> >>      nr_files = 0;
> >>
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9c8ca6cd3ce4..515ab29c2adf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1962,9 +1962,12 @@  static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out;
 
 	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
-		down_read(&mm->mmap_sem);
-		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
-		up_read(&mm->mmap_sem);
+		status = down_read_killable(&mm->mmap_sem);
+		if (!status) {
+			exact_vma_exists = !!find_exact_vma(mm, vm_start,
+							    vm_end);
+			up_read(&mm->mmap_sem);
+		}
 	}
 
 	mmput(mm);
@@ -2010,8 +2013,11 @@  static int map_files_get_link(struct dentry *dentry, struct path *path)
 	if (rc)
 		goto out_mmput;
 
+	rc = down_read_killable(&mm->mmap_sem);
+	if (rc)
+		goto out_mmput;
+
 	rc = -ENOENT;
-	down_read(&mm->mmap_sem);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
 		*path = vma->vm_file->f_path;
@@ -2107,7 +2113,10 @@  static struct dentry *proc_map_files_lookup(struct inode *dir,
 	if (!mm)
 		goto out_put_task;
 
-	down_read(&mm->mmap_sem);
+	result = ERR_PTR(-EINTR);
+	if (down_read_killable(&mm->mmap_sem))
+		goto out_put_mm;
+
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (!vma)
 		goto out_no_vma;
@@ -2118,6 +2127,7 @@  static struct dentry *proc_map_files_lookup(struct inode *dir,
 
 out_no_vma:
 	up_read(&mm->mmap_sem);
+out_put_mm:
 	mmput(mm);
 out_put_task:
 	put_task_struct(task);
@@ -2160,7 +2170,12 @@  proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 	mm = get_task_mm(task);
 	if (!mm)
 		goto out_put_task;
-	down_read(&mm->mmap_sem);
+
+	ret = down_read_killable(&mm->mmap_sem);
+	if (ret) {
+		mmput(mm);
+		goto out_put_task;
+	}
 
 	nr_files = 0;