
[RFC,2/3] pipe: protect pipe_max_size access with a mutex

Message ID 1504622676-2992-3-git-send-email-joe.lawrence@redhat.com (mailing list archive)
State New, archived

Commit Message

Joe Lawrence Sept. 5, 2017, 2:44 p.m. UTC
pipe_max_size is assigned directly via procfs sysctl:

  static struct ctl_table fs_table[] = {
          ...
          {
                  .procname       = "pipe-max-size",
                  .data           = &pipe_max_size,
                  .maxlen         = sizeof(int),
                  .mode           = 0644,
                  .proc_handler   = &pipe_proc_fn,
                  .extra1         = &pipe_min_size,
          },
          ...

  int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
                   size_t *lenp, loff_t *ppos)
  {
          ...
          ret = proc_dointvec_minmax(table, write, buf, lenp, ppos)
          ...

and then rounded in place a few statements later:

          ...
          pipe_max_size = round_pipe_size(pipe_max_size);
          ...

This leaves a window of time between initial assignment and rounding
that may be visible to other threads.  (For example, one thread sets a
non-rounded value to pipe_max_size while another reads its value.)

Similar reads of pipe_max_size are potentially racy:

  pipe.c :: alloc_pipe_info()
  pipe.c :: pipe_set_size()

Protect them and the procfs sysctl assignment with a mutex.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
---
 fs/pipe.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

Comments

Mikulas Patocka Sept. 19, 2017, 7:53 a.m. UTC | #1
On Fri, 15 Sep 2017, Joe Lawrence wrote:

> On 09/14/2017 07:09 PM, Mikulas Patocka wrote:
> > On Tue, 5 Sep 2017, Joe Lawrence wrote:
> >> pipe_max_size is assigned directly via procfs sysctl:
> >>
> >>   static struct ctl_table fs_table[] = {
> >>           ...
> >>           {
> >>                   .procname       = "pipe-max-size",
> >>                   .data           = &pipe_max_size,
> >>                   .maxlen         = sizeof(int),
> >>                   .mode           = 0644,
> >>                   .proc_handler   = &pipe_proc_fn,
> >>                   .extra1         = &pipe_min_size,
> >>           },
> >>           ...
> >>
> >>   int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
> >>                    size_t *lenp, loff_t *ppos)
> >>   {
> >>           ...
> >>           ret = proc_dointvec_minmax(table, write, buf, lenp, ppos)
> >>           ...
> >>
> >> and then rounded in place a few statements later:
> >>
> >>           ...
> >>           pipe_max_size = round_pipe_size(pipe_max_size);
> >>           ...
> >>
> >> This leaves a window of time between initial assignment and rounding
> >> that may be visible to other threads.  (For example, one thread sets a
> >> non-rounded value to pipe_max_size while another reads its value.)
> >>
> >> Similar reads of pipe_max_size are potentially racy:
> >>
> >>   pipe.c :: alloc_pipe_info()
> >>   pipe.c :: pipe_set_size()
> >>
> >> Protect them and the procfs sysctl assignment with a mutex.
> >>
> >> Reported-by: Mikulas Patocka <mpatocka@redhat.com>
> >> Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
> >> ---
> >>  fs/pipe.c | 28 ++++++++++++++++++++++++----
> >>  1 file changed, 24 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/fs/pipe.c b/fs/pipe.c
> >> index fa28910b3c59..33bb11b0d78e 100644
> >> --- a/fs/pipe.c
> >> +++ b/fs/pipe.c
> >> @@ -35,6 +35,11 @@
> >>  unsigned int pipe_max_size = 1048576;
> >>  
> >>  /*
> >> + * Provide mutual exclusion around access to pipe_max_size
> >> + */
> >> +static DEFINE_MUTEX(pipe_max_mutex);
> >> +
> >> +/*
> >>   * Minimum pipe size, as required by POSIX
> >>   */
> >>  unsigned int pipe_min_size = PAGE_SIZE;
> >> @@ -623,13 +628,18 @@ struct pipe_inode_info *alloc_pipe_info(void)
> >>  	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
> >>  	struct user_struct *user = get_current_user();
> >>  	unsigned long user_bufs;
> >> +	unsigned int max_size;
> >>  
> >>  	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
> >>  	if (pipe == NULL)
> >>  		goto out_free_uid;
> >>  
> >> -	if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE))
> >> -		pipe_bufs = pipe_max_size >> PAGE_SHIFT;
> >> +	mutex_lock(&pipe_max_mutex);
> >> +	max_size = pipe_max_size;
> >> +	mutex_unlock(&pipe_max_mutex);
> >> +
> >> +	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
> >> +		pipe_bufs = max_size >> PAGE_SHIFT;
> >>  
> >>  	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
> >>  
> >> @@ -1039,6 +1049,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
> >>  	struct pipe_buffer *bufs;
> >>  	unsigned int size, nr_pages;
> >>  	unsigned long user_bufs;
> >> +	unsigned int max_size;
> >>  	long ret = 0;
> >>  
> >>  	size = round_pipe_size(arg);
> >> @@ -1056,8 +1067,11 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
> >>  	 * Decreasing the pipe capacity is always permitted, even
> >>  	 * if the user is currently over a limit.
> >>  	 */
> >> +	mutex_lock(&pipe_max_mutex);
> >> +	max_size = pipe_max_size;
> >> +	mutex_unlock(&pipe_max_mutex);
> >>  	if (nr_pages > pipe->buffers &&
> >> -			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
> >> +			size > max_size && !capable(CAP_SYS_RESOURCE))
> >>  		return -EPERM;
> >>  
> >>  	user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
> >> @@ -1131,18 +1145,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
> >>  	unsigned int rounded_pipe_max_size;
> >>  	int ret;
> >>  
> >> +	mutex_lock(&pipe_max_mutex);
> >>  	orig_pipe_max_size = pipe_max_size;
> >>  	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
> >> -	if (ret < 0 || !write)
> >> +	if (ret < 0 || !write) {
> >> +		mutex_unlock(&pipe_max_mutex);
> >>  		return ret;
> >> +	}
> >>  
> >>  	rounded_pipe_max_size = round_pipe_size(pipe_max_size);
> >>  	if (rounded_pipe_max_size == 0) {
> >>  		pipe_max_size = orig_pipe_max_size;
> >> +		mutex_unlock(&pipe_max_mutex);
> >>  		return -EINVAL;
> >>  	}
> >>  
> >>  	pipe_max_size = rounded_pipe_max_size;
> >> +	mutex_unlock(&pipe_max_mutex);
> >> +
> >>  	return ret;
> >>  }
> >>  
> >> -- 
> >> 1.8.3.1
> >>
> > I think this mutex is too heavy - if multiple processes simultaneously
> > create a pipe, the mutex would cause performance degradation.
> >
> > You can call do_proc_dointvec with a custom callback "conv" function that
> > does the rounding of the pipe size value.
> >
> > Mikulas
> >
> 
> Hi Mikulas,
> 
> I'm not strong when it comes to memory barriers, but one of the
> side-effects of using the mutex is that pipe_set_size() and
> alloc_pipe_info() should have a consistent view of pipe_max_size.
> 
> If I remove the mutex (and assume that I implement a custom
> do_proc_dointvec "conv" callback), is it safe for these routines to
> directly use pipe_max_size as they had done before?
>
> If not, is it safe to alias through a temporary stack variable (ie,
> could the compiler re-read pipe_max_size multiple times in the function)?
>
> Would READ_ONCE() help in any way?

Theoretically re-reading the variable is possible and you should use 
ACCESS_ONCE or READ_ONCE+WRITE_ONCE on that variable.

In practice, ACCESS_ONCE/READ_ONCE/WRITE_ONCE is missing from a lot of 
kernel variables that could be modified asynchronously; no one is 
complaining about it, and no one is making any systematic effort to fix it.

That re-reading happens (I have some test code that makes the gcc 
optimizer re-read a variable), but it happens very rarely.

Another theoretical problem is that when reading or writing a variable 
without ACCESS_ONCE, the compiler could read and write the variable using 
multiple smaller memory accesses. But in practice, it happens only on some 
uncommon architectures.
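
A minimal sketch of that READ_ONCE()/WRITE_ONCE() pattern, applied to the
alloc_pipe_info() check (this is not what the posted patch does, and it
assumes pipe_proc_fn() publishes the rounded value with a matching
WRITE_ONCE()):

	/* Snapshot pipe_max_size once so the comparison and the shift see
	 * the same value and the compiler cannot re-read the global. */
	unsigned int max_size = READ_ONCE(pipe_max_size);

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

with the writer side in pipe_proc_fn() becoming:

	WRITE_ONCE(pipe_max_size, rounded_pipe_max_size);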

> The mutex covered up some confusion on my part here.
> 
> OTOH, since pipe_max_size is read-only for pipe_set_size() and
> alloc_pipe_info() and only updated occasionally by pipe_proc_fn(), would
> rw_semaphore or RCU be a fit here?

An RW semaphore causes cache-line ping-pong between CPUs; it slows down the 
kernel just like a normal spinlock or mutex.

RCU would be useless here (i.e. you don't want to allocate memory and 
atomically assign it with rcu_assign_pointer).

> Regards,
> 
> -- Joe

Mikulas
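
A minimal sketch of the conv-callback approach suggested above (not the
posted patch): the rounding happens before the value is ever stored, so no
unrounded pipe_max_size is visible to readers.  The signature follows the
do_proc_dointvec() conv helpers in kernel/sysctl.c of this era; the function
name is illustrative, and round_pipe_size() would need to be made reachable
from that file:

	static int pipe_max_size_conv(bool *negp, unsigned long *lvalp,
				      int *valp, int write, void *data)
	{
		if (write) {
			unsigned int rounded;

			if (*negp)
				return -EINVAL;
			rounded = round_pipe_size(*lvalp);
			if (rounded == 0)
				return -EINVAL;
			*valp = rounded;	/* publish only the rounded value */
		} else {
			*negp = false;
			*lvalp = (unsigned long)*valp;
		}
		return 0;
	}
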
Joe Lawrence Sept. 19, 2017, 9:32 p.m. UTC | #2
On 09/19/2017 03:53 AM, Mikulas Patocka wrote:
> On Fri, 15 Sep 2017, Joe Lawrence wrote:
> [ ... snip ... ]
>> Hi Mikulas,
>>
>> I'm not strong when it comes to memory barriers, but one of the
>> side-effects of using the mutex is that pipe_set_size() and
>> alloc_pipe_info() should have a consistent view of pipe_max_size.
>>
>> If I remove the mutex (and assume that I implement a custom
>> do_proc_dointvec "conv" callback), is it safe for these routines to
>> directly use pipe_max_size as they had done before?
>>
>> If not, is it safe to alias through a temporary stack variable (ie,
>> could the compiler re-read pipe_max_size multiple times in the function)?
>>
>> Would READ_ONCE() help in any way?
> 
> Theoretically re-reading the variable is possible and you should use 
> ACCESS_ONCE or READ_ONCE+WRITE_ONCE on that variable.
> 
> In practice, ACCESS_ONCE/READ_ONCE/WRITE_ONCE is missing at a lot of 
> kernel variables that could be modified asynchronously and no one is 
> complaining about it and no one is making any systematic effort to fix it.
> 
> That re-reading happens (I have some test code that makes the gcc 
> optimizer re-read a variable), but it happens very rarely.

This would be interesting to look at if you are willing to share (can
send offlist).

> Another theoretical problem is that when reading or writing a variable 
> without ACCESS_ONCE, the compiler could read and write the variable using 
> multiple smaller memory accesses. But in practice, it happens only on some 
> non-common architectures.

Smaller access than word size?

>> The mutex covered up some confusion on my part here.
>>
>> OTOH, since pipe_max_size is read-only for pipe_set_size() and
>> alloc_pipe_info() and only updated occasionally by pipe_proc_fn(), would
>> rw_semaphore or RCU be a fit here?
> 
> RW semaphore causes cache-line ping-pong between CPUs, it slows down the 
> kernel just like a normal spinlock or mutex.

Ah right.

> RCU would be useless here (i.e. you don't want to allocate memory and 
> atomically assign it with rcu_assign_pointer).

And good point here.

Thanks for the explanations; they confirm and expand what I was already
thinking in this space.

--- Joe
Mikulas Patocka Sept. 21, 2017, 10:05 a.m. UTC | #3
On Tue, 19 Sep 2017, Joe Lawrence wrote:

> On 09/19/2017 03:53 AM, Mikulas Patocka wrote:
> > On Fri, 15 Sep 2017, Joe Lawrence wrote:
> > [ ... snip ... ]
> >> Hi Mikulas,
> >>
> >> I'm not strong when it comes to memory barriers, but one of the
> >> side-effects of using the mutex is that pipe_set_size() and
> >> alloc_pipe_info() should have a consistent view of pipe_max_size.
> >>
> >> If I remove the mutex (and assume that I implement a custom
> >> do_proc_dointvec "conv" callback), is it safe for these routines to
> >> directly use pipe_max_size as they had done before?
> >>
> >> If not, is it safe to alias through a temporary stack variable (ie,
> >> could the compiler re-read pipe_max_size multiple times in the function)?
> >>
> >> Would READ_ONCE() help in any way?
> > 
> > Theoretically re-reading the variable is possible and you should use 
> > ACCESS_ONCE or READ_ONCE+WRITE_ONCE on that variable.
> > 
> > In practice, ACCESS_ONCE/READ_ONCE/WRITE_ONCE is missing at a lot of 
> > kernel variables that could be modified asynchronously and no one is 
> > complaining about it and no one is making any systematic effort to fix it.
> > 
> > That re-reading happens (I have some test code that makes the gcc 
> > optimizer re-read a variable), but it happens very rarely.
> 
> This would be interesting to look at if you are willing to share (can
> send offlist).

struct s {
        unsigned a, b, c, d;
};

unsigned fn(struct s *s)
{
        unsigned a = s->a;
        s->b = a;
        asm("nop":::"ebx","ecx","edx","esi","edi","ebp");
        s->c = a;
        return s->d;
}

This piece of code makes gcc read the variable s->a twice (although it is 
read only once in the source code). Compile it with -m32 -O2. The result 
is this:

00000000 <fn>:
   0:   55                      push   %ebp
   1:   57                      push   %edi
   2:   56                      push   %esi
   3:   53                      push   %ebx
   4:   8b 44 24 14             mov    0x14(%esp),%eax
   8:   8b 10                   mov    (%eax),%edx   <--- 1st load of s->a
   a:   89 50 04                mov    %edx,0x4(%eax)
   d:   90                      nop
   e:   8b 08                   mov    (%eax),%ecx   <--- 2nd load of s->a
  10:   89 48 08                mov    %ecx,0x8(%eax)
  13:   8b 40 0c                mov    0xc(%eax),%eax
  16:   5b                      pop    %ebx
  17:   5e                      pop    %esi
  18:   5f                      pop    %edi
  19:   5d                      pop    %ebp
  1a:   c3                      ret


> > Another theoretical problem is that when reading or writing a variable 
> > without ACCESS_ONCE, the compiler could read and write the variable using 
> > multiple smaller memory accesses. But in practice, it happens only on some 
> > non-common architectures.
> 
> Smaller access than word size?

Yes. The file Documentation/memory-barriers.txt says:
 (*) For aligned memory locations whose size allows them to be accessed
     with a single memory-reference instruction, prevents "load tearing"
     and "store tearing," in which a single large access is replaced by
     multiple smaller accesses.  For example, given an architecture having
     16-bit store instructions with 7-bit immediate fields, the compiler
     might be tempted to use two 16-bit store-immediate instructions to
     implement the following 32-bit store:

        p = 0x00010002;

     Please note that GCC really does use this sort of optimization,
     which is not surprising given that it would likely take more
     than two instructions to build the constant and then store it.

But it doesn't say on which architecture gcc does it.
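
The store side has the matching remedy (a sketch, assuming a kernel context
where WRITE_ONCE() is available): a volatile full-width store that, per the
passage above, the compiler may not split into smaller accesses:

	WRITE_ONCE(p, 0x00010002);	/* one aligned 32-bit store, no 16-bit split */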

Mikulas

Patch

diff --git a/fs/pipe.c b/fs/pipe.c
index fa28910b3c59..33bb11b0d78e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -35,6 +35,11 @@ 
 unsigned int pipe_max_size = 1048576;
 
 /*
+ * Provide mutual exclusion around access to pipe_max_size
+ */
+static DEFINE_MUTEX(pipe_max_mutex);
+
+/*
  * Minimum pipe size, as required by POSIX
  */
 unsigned int pipe_min_size = PAGE_SIZE;
@@ -623,13 +628,18 @@ struct pipe_inode_info *alloc_pipe_info(void)
 	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 	struct user_struct *user = get_current_user();
 	unsigned long user_bufs;
+	unsigned int max_size;
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 	if (pipe == NULL)
 		goto out_free_uid;
 
-	if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE))
-		pipe_bufs = pipe_max_size >> PAGE_SHIFT;
+	mutex_lock(&pipe_max_mutex);
+	max_size = pipe_max_size;
+	mutex_unlock(&pipe_max_mutex);
+
+	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
+		pipe_bufs = max_size >> PAGE_SHIFT;
 
 	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
 
@@ -1039,6 +1049,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	struct pipe_buffer *bufs;
 	unsigned int size, nr_pages;
 	unsigned long user_bufs;
+	unsigned int max_size;
 	long ret = 0;
 
 	size = round_pipe_size(arg);
@@ -1056,8 +1067,11 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	 * Decreasing the pipe capacity is always permitted, even
 	 * if the user is currently over a limit.
 	 */
+	mutex_lock(&pipe_max_mutex);
+	max_size = pipe_max_size;
+	mutex_unlock(&pipe_max_mutex);
 	if (nr_pages > pipe->buffers &&
-			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+			size > max_size && !capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
 	user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
@@ -1131,18 +1145,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
 	unsigned int rounded_pipe_max_size;
 	int ret;
 
+	mutex_lock(&pipe_max_mutex);
 	orig_pipe_max_size = pipe_max_size;
 	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
-	if (ret < 0 || !write)
+	if (ret < 0 || !write) {
+		mutex_unlock(&pipe_max_mutex);
 		return ret;
+	}
 
 	rounded_pipe_max_size = round_pipe_size(pipe_max_size);
 	if (rounded_pipe_max_size == 0) {
 		pipe_max_size = orig_pipe_max_size;
+		mutex_unlock(&pipe_max_mutex);
 		return -EINVAL;
 	}
 
 	pipe_max_size = rounded_pipe_max_size;
+	mutex_unlock(&pipe_max_mutex);
+
 	return ret;
 }