[RFC] fs: use a sequence counter instead of file_lock in fd_install

Message ID 20150416121628.GA20615@mguzik (mailing list archive)
State New, archived

Commit Message

Mateusz Guzik April 16, 2015, 12:16 p.m. UTC
Hi,

Currently, obtaining a new file descriptor results in locking the fdtable
twice: once to reserve a slot and a second time to fill it.

Hack below gets rid of the second lock usage.

It gives me a ~30% speedup (~300k ops -> ~400k ops) in a microbenchmark
where 16 threads create a pipe (2 fds) and call 2 * close.

Results fluctuate, and even with the patch they sometimes drop to around
~300k ops. However, without the patch they never get higher.

I can come up with a better benchmark if that's necessary.
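
(For reference, the benchmark loop is essentially the following; a
minimal sketch only, not the exact pipebench.c linked later in the
thread:)

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	static volatile int stop;

	/* Each thread creates a pipe (2 fds) and closes both fds,
	 * exercising fd reservation, fd_install and close. */
	static void *worker(void *arg)
	{
		unsigned long *ops = arg;

		while (!stop) {
			int fds[2];

			if (pipe(fds))
				abort();
			close(fds[0]);
			close(fds[1]);
			(*ops)++;
		}
		return NULL;
	}

	int main(void)
	{
		enum { NTHREADS = 16, SECS = 10 };
		pthread_t tid[NTHREADS];
		unsigned long ops[NTHREADS] = { 0 }, total = 0;
		int i;

		for (i = 0; i < NTHREADS; i++)
			pthread_create(&tid[i], NULL, worker, &ops[i]);
		sleep(SECS);
		stop = 1;
		for (i = 0; i < NTHREADS; i++) {
			pthread_join(tid[i], NULL);
			total += ops[i];
		}
		printf("%lu ops/s\n", total / SECS);
		return 0;
	}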

Comments?

Comments

Eric Dumazet April 16, 2015, 5:47 p.m. UTC | #1
On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> Hi,
> 
> Currently, obtaining a new file descriptor results in locking the fdtable
> twice: once to reserve a slot and a second time to fill it.
> 
> Hack below gets rid of the second lock usage.
> 
> It gives me a ~30% speedup (~300k ops -> ~400k ops) in a microbenchmark
> where 16 threads create a pipe (2 fds) and call 2 * close.
> 
> Results fluctuate, and even with the patch they sometimes drop to around
> ~300k ops. However, without the patch they never get higher.
> 
> I can come up with a better benchmark if that's necessary.

Please push a patch with this test program alone; it will serve as a
baseline.

I discussed this problem with Al at LKS 2014 in Chicago.

I am pleased to see you are working on this!

Please find one comment enclosed.

> 
> Comments?
> 
> ==============================================
> 
> Subject: [PATCH] fs: use a sequence counter instead of file_lock in fd_install
> 
> Because the lock is not held, it is possible that fdtable will be
> reallocated as we fill it.
> 
> RCU is used to guarantee the old table is not freed, just in case we
> happen to write to it (which is harmless).
> 
> A sequence counter is used to ensure we updated the right table.
> 
> Signed-off-by: Mateusz Guzik <mguzik@redhat.com>
> ---
>  fs/file.c               | 24 +++++++++++++++++++-----
>  include/linux/fdtable.h |  5 +++++
>  2 files changed, 24 insertions(+), 5 deletions(-)


>  void fd_install(unsigned int fd, struct file *file)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 230f87b..9e41765 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -12,6 +12,7 @@
>  #include <linux/types.h>
>  #include <linux/init.h>
>  #include <linux/fs.h>
> +#include <linux/seqlock.h>
>  
>  #include <linux/atomic.h>
>  
> @@ -47,6 +48,7 @@ struct files_struct {
>     * read mostly part
>     */
>  	atomic_t count;
> +	seqcount_t fdt_seqcount;


You put fdt_seqcount in the 'read mostly part' of 'struct files_struct';
please move it to the 'written part'.


>  	struct fdtable __rcu *fdt;
>  	struct fdtable fdtab;
>    /*
> @@ -69,6 +71,9 @@ struct dentry;
>  #define files_fdtable(files) \
>  	rcu_dereference_check_fdtable((files), (files)->fdt)
>  
> +#define files_fdtable_seq(files) \
> +	rcu_dereference((files)->fdt)
> +
>  /*
>   * The caller must ensure that fd table isn't shared or hold rcu or file lock
>   */


Al Viro April 16, 2015, 6:09 p.m. UTC | #2
On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
>  	cur_fdt = files_fdtable(files);
>  	if (nr >= cur_fdt->max_fds) {
>  		/* Continue as planned */
> +		write_seqcount_begin(&files->fdt_seqcount);
>  		copy_fdtable(new_fdt, cur_fdt);
>  		rcu_assign_pointer(files->fdt, new_fdt);
> +		write_seqcount_end(&files->fdt_seqcount);
>  		if (cur_fdt != &files->fdtab)
>  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);

Interesting.  AFAICS, your test doesn't step anywhere near that path,
does it?  So basically you never hit the retries during that...
Eric Dumazet April 16, 2015, 8:42 p.m. UTC | #3
On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >  	cur_fdt = files_fdtable(files);
> >  	if (nr >= cur_fdt->max_fds) {
> >  		/* Continue as planned */
> > +		write_seqcount_begin(&files->fdt_seqcount);
> >  		copy_fdtable(new_fdt, cur_fdt);
> >  		rcu_assign_pointer(files->fdt, new_fdt);
> > +		write_seqcount_end(&files->fdt_seqcount);
> >  		if (cur_fdt != &files->fdtab)
> >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> 
> Interesting.  AFAICS, your test doesn't step anywhere near that path,
> does it?  So basically you never hit the retries during that...

Right, but then the table is almost never changed for a given process,
as we only increase it by power of two steps.

(So I scratch my initial comment, fdt_seqcount is really mostly read)



Eric Dumazet April 16, 2015, 8:55 p.m. UTC | #4
On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > >  	cur_fdt = files_fdtable(files);
> > >  	if (nr >= cur_fdt->max_fds) {
> > >  		/* Continue as planned */
> > > +		write_seqcount_begin(&files->fdt_seqcount);
> > >  		copy_fdtable(new_fdt, cur_fdt);
> > >  		rcu_assign_pointer(files->fdt, new_fdt);
> > > +		write_seqcount_end(&files->fdt_seqcount);
> > >  		if (cur_fdt != &files->fdtab)
> > >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > 
> > Interesting.  AFAICS, your test doesn't step anywhere near that path,
> > does it?  So basically you never hit the retries during that...
> 
> Right, but then the table is almost never changed for a given process,
> as we only increase it by power of two steps.
> 
> (So I scratch my initial comment, fdt_seqcount is really mostly read)

I tested Mateusz's patch with my opensock program, mimicking a bit more
what a server does (having lots of sockets).

24 threads running, doing close(randomfd())/socket() calls like crazy.

Before patch :

# time ./opensock 

real	0m10.863s
user	0m0.954s
sys	2m43.659s


After patch :

# time ./opensock

real	0m9.750s
user	0m0.804s
sys	2m18.034s

So this is an improvement for sure, but not massive.

perf record ./opensock ; perf report

    87.60%  opensock  [kernel.kallsyms]  [k] _raw_spin_lock                     
     1.57%  opensock  [kernel.kallsyms]  [k] find_next_zero_bit                 
     0.50%  opensock  [kernel.kallsyms]  [k] memset_erms                        
     0.44%  opensock  [kernel.kallsyms]  [k] __alloc_fd                         
     0.44%  opensock  [kernel.kallsyms]  [k] tcp_close                          
     0.43%  opensock  [kernel.kallsyms]  [k] get_empty_filp                     
     0.43%  opensock  [kernel.kallsyms]  [k] kmem_cache_free                    
     0.40%  opensock  [kernel.kallsyms]  [k] free_block                         
     0.34%  opensock  [kernel.kallsyms]  [k] __close_fd                         
     0.32%  opensock  [kernel.kallsyms]  [k] sk_alloc                           
     0.30%  opensock  [kernel.kallsyms]  [k] _raw_spin_lock_bh                  
     0.24%  opensock  [kernel.kallsyms]  [k] inet_csk_destroy_sock              
     0.22%  opensock  [kernel.kallsyms]  [k] kmem_cache_alloc                   
     0.22%  opensock  opensock           [.] __pthread_disable_asynccancel      
     0.21%  opensock  [kernel.kallsyms]  [k] lockref_put_return                 
     0.20%  opensock  [kernel.kallsyms]  [k] filp_close                         

perf record -g ./opensock ; perf report --stdio

    87.80%  opensock  [kernel.kallsyms]  [k] _raw_spin_lock                     
            |
            --- _raw_spin_lock
               |          
               |--52.70%-- __close_fd
               |          sys_close
               |          system_call_fastpath
               |          __libc_close
               |          |          
               |          |--98.97%-- 0x0
               |           --1.03%-- [...]
               |          
               |--46.41%-- __alloc_fd
               |          get_unused_fd_flags
               |          sock_map_fd
               |          sys_socket
               |          system_call_fastpath
               |          __socket
               |          |          
               |           --100.00%-- 0x0
                --0.89%-- [...]

     1.54%  opensock  [kernel.kallsyms]  [k] find_next_zero_bit                 
            |





Mateusz Guzik April 16, 2015, 10 p.m. UTC | #5
On Thu, Apr 16, 2015 at 01:55:39PM -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> > On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > > >  	cur_fdt = files_fdtable(files);
> > > >  	if (nr >= cur_fdt->max_fds) {
> > > >  		/* Continue as planned */
> > > > +		write_seqcount_begin(&files->fdt_seqcount);
> > > >  		copy_fdtable(new_fdt, cur_fdt);
> > > >  		rcu_assign_pointer(files->fdt, new_fdt);
> > > > +		write_seqcount_end(&files->fdt_seqcount);
> > > >  		if (cur_fdt != &files->fdtab)
> > > >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > > 
> > > Interesting.  AFAICS, your test doesn't step anywhere near that path,
> > > does it?  So basically you never hit the retries during that...
> > 
> > Right, but then the table is almost never changed for a given process,
> > as we only increase it by power of two steps.
> > 
> > (So I scratch my initial comment, fdt_seqcount is really mostly read)
> 
> I tested Mateusz's patch with my opensock program, mimicking a bit more
> what a server does (having lots of sockets).
> 
> 24 threads running, doing close(randomfd())/socket() calls like crazy.
> 
> Before patch :
> 
> # time ./opensock 
> 
> real	0m10.863s
> user	0m0.954s
> sys	2m43.659s
> 
> 
> After patch :
> 
> # time ./opensock
> 
> real	0m9.750s
> user	0m0.804s
> sys	2m18.034s
> 
> So this is an improvement for sure, but not massive.
> 
> perf record ./opensock ; perf report
> 
>     87.80%  opensock  [kernel.kallsyms]  [k] _raw_spin_lock                     
>                |--52.70%-- __close_fd
>                |--46.41%-- __alloc_fd

My crap benchmark is here: http://people.redhat.com/~mguzik/pipebench.c
(compile with -pthread, run with -s 10 -n 16 for 10 second test + 16
threads)

As noted earlier, it tends to go from roughly 300k ops/s to 400k.

The fundamental problem here seems to be this pesky POSIX requirement of
providing the lowest possible fd on each allocation (as a side note
Linux breaks this with parallel fd allocs, where one of these backs off
the reservation, not that I believe this causes trouble).
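
To illustrate, the allocation path is forced into roughly the following
shape (a simplified sketch loosely modelled on __alloc_fd, not the exact
kernel code); it is this file_lock critical section that dominates the
profiles above:

	static int alloc_lowest_fd(struct files_struct *files)
	{
		struct fdtable *fdt;
		unsigned int fd;

		spin_lock(&files->file_lock);
		fdt = files_fdtable(files);
		/* POSIX: must hand out the lowest free slot */
		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds,
					files->next_fd);
		if (fd >= fdt->max_fds) {
			spin_unlock(&files->file_lock);
			return -EMFILE;	/* the real code expands the table */
		}
		__set_bit(fd, fdt->open_fds);
		files->next_fd = fd + 1;
		spin_unlock(&files->file_lock);
		return fd;
	}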

Ideally a process-wide switch could be implemented (e.g.
prctl(SCRATCH_LOWEST_FD_REQ)) which would grant the kernel the freedom
to return any fd it wants, so it would be possible to have fd ranges
per thread and the like.

Having only a O_SCRATCH_POSIX flag passed to syscalls would still leave
close() as a bottleneck.

In the meantime I consider the approach taken in my patch as an ok
temporary improvement.
Mateusz Guzik April 16, 2015, 10:35 p.m. UTC | #6
On Thu, Apr 16, 2015 at 07:09:32PM +0100, Al Viro wrote:
> On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >  	cur_fdt = files_fdtable(files);
> >  	if (nr >= cur_fdt->max_fds) {
> >  		/* Continue as planned */
> > +		write_seqcount_begin(&files->fdt_seqcount);
> >  		copy_fdtable(new_fdt, cur_fdt);
> >  		rcu_assign_pointer(files->fdt, new_fdt);
> > +		write_seqcount_end(&files->fdt_seqcount);
> >  		if (cur_fdt != &files->fdtab)
> >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> 
> Interesting.  AFAICS, your test doesn't step anywhere near that path,
> does it?  So basically you never hit the retries during that...

well, yeah. In fact for non-shared tables one could go a step further
and just plop the pointer in, but I don't know if that makes much sense.
Other processes inspecting the table could get away with a data
dependency barrier. Closing would still take the lock, so you can only
suddenly see a filp installed, but never one going away.

Now, as far as correctness goes, I think there is a bug in the patch
(which does not invalidate the idea though). Chances are I got a fix as
well.

Benchmark prog is here: http://people.redhat.com/~mguzik/pipebench.c
A modified version: http://people.redhat.com/~mguzik/fdi-fail.c

Benchmark is just doing pipe + close in a loop in multiple threads.

Modified version spawns threads, sleeps 100 ms and does dup2(0, 300) to
reallocate the table while other threads continue the work.

This successfully tested retries (along with cases where an installed file
got copied and was encountered during a retry).

However, I see sporadic close failures. I presume this is because of a
missing read barrier after write_seqcount_begin. Adding an smp_mb()
seems to solve the problem, but I could only test on 2 * 16 Intel(R)
Xeon(R) CPU E5-2660 0 @ 2.20GHz.
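
The placement I have in mind is on the write side (a sketch of the
suspicion only, not a verified fix):

	write_seqcount_begin(&files->fdt_seqcount);
	/*
	 * write_seqcount_begin() only implies smp_wmb(), so the loads
	 * done by copy_fdtable() can be reordered before the seqcount
	 * store becomes visible and thus miss a concurrent fd_install().
	 * A full barrier orders the seqcount store before those loads,
	 * pairing with the reader's smp_mb() in __fd_install().
	 */
	smp_mb();
	copy_fdtable(new_fdt, cur_fdt);
	rcu_assign_pointer(files->fdt, new_fdt);
	write_seqcount_end(&files->fdt_seqcount);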

My memory barrier-fu is rather weak and I'm not that confident in my
crap suspicion here.

Thoughts?
Eric Dumazet April 16, 2015, 10:52 p.m. UTC | #7
On Fri, 2015-04-17 at 00:00 +0200, Mateusz Guzik wrote:
> On Thu, Apr 16, 2015 at 01:55:39PM -0700, Eric Dumazet wrote:
> > On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> > > On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > > > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
> > > > >  	cur_fdt = files_fdtable(files);
> > > > >  	if (nr >= cur_fdt->max_fds) {
> > > > >  		/* Continue as planned */
> > > > > +		write_seqcount_begin(&files->fdt_seqcount);
> > > > >  		copy_fdtable(new_fdt, cur_fdt);
> > > > >  		rcu_assign_pointer(files->fdt, new_fdt);
> > > > > +		write_seqcount_end(&files->fdt_seqcount);
> > > > >  		if (cur_fdt != &files->fdtab)
> > > > >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > > > 
> > > > Interesting.  AFAICS, your test doesn't step anywhere near that path,
> > > > does it?  So basically you never hit the retries during that...
> > > 
> > > Right, but then the table is almost never changed for a given process,
> > > as we only increase it by power of two steps.
> > > 
> > > (So I scratch my initial comment, fdt_seqcount is really mostly read)
> > 
> > I tested Mateusz's patch with my opensock program, mimicking a bit more
> > what a server does (having lots of sockets).
> > 
> > 24 threads running, doing close(randomfd())/socket() calls like crazy.
> > 
> > Before patch :
> > 
> > # time ./opensock 
> > 
> > real	0m10.863s
> > user	0m0.954s
> > sys	2m43.659s
> > 
> > 
> > After patch :
> > 
> > # time ./opensock
> > 
> > real	0m9.750s
> > user	0m0.804s
> > sys	2m18.034s
> > 
> > So this is an improvement for sure, but not massive.
> > 
> > perf record ./opensock ; perf report
> > 
> >     87.80%  opensock  [kernel.kallsyms]  [k] _raw_spin_lock                     
> >                |--52.70%-- __close_fd
> >                |--46.41%-- __alloc_fd
> 
> My crap benchmark is here: http://people.redhat.com/~mguzik/pipebench.c
> (compile with -pthread, run with -s 10 -n 16 for 10 second test + 16
> threads)
> 
> As noted earlier, it tends to go from roughly 300k ops/s to 400k.
> 
> The fundamental problem here seems to be this pesky POSIX requirement of
> providing the lowest possible fd on each allocation (as a side note
> Linux breaks this with parallel fd allocs, where one of these backs off
> the reservation, not that I believe this causes trouble).

Note POSIX never talked about multiple threads. The POSIX requirement came
from traditional linux stdin/stdout/stderr handling and legacy programs,
before dup2() even existed.


> 
> Ideally a process-wide switch could be implemented (e.g.
> prctl(SCRATCH_LOWEST_FD_REQ)) which would grant the kernel the freedom
> to return any fd it wants, so it would be possible to have fd ranges
> per thread and the like.

I played months ago with a SOCK_FD_FASTALLOC ;)

The idea was to use a random starting point instead of 0.

But the bottleneck was really the spinlock, not the bit search, unless I
used 10 million fds in the program...
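
Something like this in the search loop (a hypothetical sketch of the
idea, not the actual patch):

	/* Trade the POSIX lowest-fd guarantee for a random start. */
	start = prandom_u32() % fdt->max_fds;
	fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, start);
	if (fd >= fdt->max_fds)
		/* nothing free above start, wrap and search [0, start) */
		fd = find_next_zero_bit(fdt->open_fds, start, 0);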

> 
> Having only a O_SCRATCH_POSIX flag passed to syscalls would still leave
> close() as a bottleneck.
> 
> In the meantime I consider the approach taken in my patch as an ok
> temporary improvement.

Yes, please formally submit this patch.

Note that adding atomic bit operations could eventually allow us to not
hold the spinlock at close() time.
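
Something in that direction (a rough sketch; it ignores next_fd and
close_on_exec maintenance, which today happen under the same lock):

	/* Atomically claim the slot instead of taking file_lock. */
	if (!test_and_clear_bit(fd, fdt->open_fds))
		return -EBADF;	/* fd was not open */
	file = xchg(&fdt->fd[fd], NULL);
	return filp_close(file, files);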


Eric Dumazet April 17, 2015, 9:46 p.m. UTC | #8
On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> Hi,
> 
> Currently, obtaining a new file descriptor results in locking the fdtable
> twice: once to reserve a slot and a second time to fill it

...


>  void __fd_install(struct files_struct *files, unsigned int fd,
>  		struct file *file)
>  {
> +	unsigned long seq;

	unsigned int seq;

>  	struct fdtable *fdt;
> -	spin_lock(&files->file_lock);
> -	fdt = files_fdtable(files);
> -	BUG_ON(fdt->fd[fd] != NULL);
> -	rcu_assign_pointer(fdt->fd[fd], file);
> -	spin_unlock(&files->file_lock);
> +
> +	rcu_read_lock();
> +	do {
> +		seq = read_seqcount_begin(&files->fdt_seqcount);
> +		fdt = files_fdtable_seq(files);
> +		/*
> +		 * Entry in the table can already be equal to file if we
> +		 * had to restart and copy_fdtable picked up our update.
> +		 */
> +		BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> +		rcu_assign_pointer(fdt->fd[fd], file);
> +		smp_mb();
> +	} while (__read_seqcount_retry(&files->fdt_seqcount, seq));
> +	rcu_read_unlock();
>  }
>  

So one problem here is:

As soon as rcu_assign_pointer(fdt->fd[fd], file) is done, and another cpu
does an expand_fdtable() and releases files->file_lock, another cpu can
close(fd).

Then another cpu can reuse the [fd] now empty slot and install a new
file in it.

Then this cpu will crash here:

BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
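
Spelled out step by step (one possible interleaving):

	cpu0: fd[fd] = file (into the table it read under the seqcount)
	cpu1: expand_fdtable() copies the table (including file),
	      publishes the new one and drops file_lock
	cpu2: close(fd), so slot [fd] becomes NULL
	cpu3: socket() reserves [fd] again and installs file2
	cpu0: retries, re-reads the new table, finds fd[fd] == file2,
	      which is neither NULL nor file, and the BUG_ON() fires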



Mateusz Guzik April 17, 2015, 10:16 p.m. UTC | #9
On Fri, Apr 17, 2015 at 02:46:56PM -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> > Hi,
> > 
> > Currently, obtaining a new file descriptor results in locking the fdtable
> > twice: once to reserve a slot and a second time to fill it
> 
> ...
> 
> 
> >  void __fd_install(struct files_struct *files, unsigned int fd,
> >  		struct file *file)
> >  {
> > +	unsigned long seq;
> 
> 	unsigned int seq;
> 
> >  	struct fdtable *fdt;
> > -	spin_lock(&files->file_lock);
> > -	fdt = files_fdtable(files);
> > -	BUG_ON(fdt->fd[fd] != NULL);
> > -	rcu_assign_pointer(fdt->fd[fd], file);
> > -	spin_unlock(&files->file_lock);
> > +
> > +	rcu_read_lock();
> > +	do {
> > +		seq = read_seqcount_begin(&files->fdt_seqcount);
> > +		fdt = files_fdtable_seq(files);
> > +		/*
> > +		 * Entry in the table can already be equal to file if we
> > +		 * had to restart and copy_fdtable picked up our update.
> > +		 */
> > +		BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> > +		rcu_assign_pointer(fdt->fd[fd], file);
> > +		smp_mb();
> > +	} while (__read_seqcount_retry(&files->fdt_seqcount, seq));
> > +	rcu_read_unlock();
> >  }
> >  
> 
> So one problem here is:
> 
> As soon as rcu_assign_pointer(fdt->fd[fd], file) is done, and another cpu
> does an expand_fdtable() and releases files->file_lock, another cpu can
> close(fd).
> 
> Then another cpu can reuse the [fd] now empty slot and install a new
> file in it.
> 
> Then this cpu will crash here:
> 
> BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> 

Ouch, this is so obvious now that you mention it. Really stupid
mistake on my side.

I would say this makes the use of seq counter impossible. Even if we
decided to fall back to a lock on retry, we cannot know what to do if
the slot is reserved - it very well could be that something called
close, and something else reserved the slot, so putting the file inside
could be really bad. In fact we would be putting a file for which we
don't have a reference anymore.

However, not all hope is lost and I still think we can speed things up.

A locking primitive which only locks stuff for current cpu and has
another mode where it locks stuff for all cpus would do the trick just
fine. I'm not a linux guy, quick search suggests 'lglock' would do what
I want.

table reallocation is an extremely rare operation, so this should be
fine. It would take the lock 'globally' for given table.

I'll play with this.
Al Viro April 17, 2015, 11:02 p.m. UTC | #10
On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:

> I would say this makes the use of seq counter impossible. Even if we
> decided to fall back to a lock on retry, we cannot know what to do if
> the slot is reserved - it very well could be that something called
> close, and something else reserved the slot, so putting the file inside
> could be really bad. In fact we would be putting a file for which we
> don't have a reference anymore.
> 
> However, not all hope is lost and I still think we can speed things up.
> 
> A locking primitive which only locks stuff for current cpu and has
> another mode where it locks stuff for all cpus would do the trick just
> fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> I want.
> 
> table reallocation is an extremely rare operation, so this should be
> fine. It would take the lock 'globally' for given table.

It would also mean percpu_alloc() for each descriptor table...
Eric Dumazet April 18, 2015, 7:41 p.m. UTC | #11
On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> 
> > I would say this makes the use of seq counter impossible. Even if we
> > decided to fall back to a lock on retry, we cannot know what to do if
> > the slot is reserved - it very well could be that something called
> > close, and something else reserved the slot, so putting the file inside
> > could be really bad. In fact we would be putting a file for which we
> > don't have a reference anymore.
> > 
> > However, not all hope is lost and I still think we can speed things up.
> > 
> > A locking primitive which only locks stuff for current cpu and has
> > another mode where it locks stuff for all cpus would do the trick just
> > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > I want.
> > 
> > table reallocation is an extremely rare operation, so this should be
> > fine. It would take the lock 'globally' for given table.
> 
> It would also mean percpu_alloc() for each descriptor table...

I would rather use an xchg() instead of rcu_assign_pointer()

old = xchg(&fdt->fd[fd], file);
if (unlikely(old))
	filp_close(old, files);

If threads are using close() on random fds, final result is not
guaranteed anyway.


Mateusz Guzik April 20, 2015, 1:06 p.m. UTC | #12
On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> 
> > I would say this makes the use of seq counter impossible. Even if we
> > decided to fall back to a lock on retry, we cannot know what to do if
> > the slot is reserved - it very well could be that something called
> > close, and something else reserved the slot, so putting the file inside
> > could be really bad. In fact we would be putting a file for which we
> > don't have a reference anymore.
> > 
> > However, not all hope is lost and I still think we can speed things up.
> > 
> > A locking primitive which only locks stuff for current cpu and has
> > another mode where it locks stuff for all cpus would do the trick just
> > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > I want.
> > 
> > table reallocation is an extremely rare operation, so this should be
> > fine. It would take the lock 'globally' for given table.
> 
> It would also mean percpu_alloc() for each descriptor table...

Well, as noted, I had not checked how it's implemented at the time
of writing that message. I agree embedding something like this into files
struct is a non-starter.

I would say this could work with a small set of locks, selected by hashing
struct files pointer.

Table resizing is supposed to be extremely rare - most processes should
not need it at all (if they do, the default size is too small and should
be adjusted). Not only that, the lock is only needed if the process in
question is multithreaded.

So I would say this would not contend in real-world workloads, but still
looks crappy.

Unfortunately the whole thing loses the original appeal of a simple hack
with no potential performance drawbacks. Maybe I'll hack it up later and
run some tests anyway.
Mateusz Guzik April 20, 2015, 1:41 p.m. UTC | #13
On Sat, Apr 18, 2015 at 12:41:38PM -0700, Eric Dumazet wrote:
> On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > 
> > > I would say this makes the use of seq counter impossible. Even if we
> > > decided to fall back to a lock on retry, we cannot know what to do if
> > > the slot is reserved - it very well could be that something called
> > > close, and something else reserved the slot, so putting the file inside
> > > could be really bad. In fact we would be putting a file for which we
> > > don't have a reference anymore.
> > > 
> > > However, not all hope is lost and I still think we can speed things up.
> > > 
> > > A locking primitive which only locks stuff for current cpu and has
> > > another mode where it locks stuff for all cpus would do the trick just
> > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > I want.
> > > 
> > > table reallocation is an extremely rare operation, so this should be
> > > fine. It would take the lock 'globally' for given table.
> > 
> > It would also mean percpu_alloc() for each descriptor table...
> 
> > I would rather use an xchg() instead of rcu_assign_pointer()
> 
> old = xchg(&fdt->fd[fd], file);
> if (unlikely(old))
> 	filp_close(old, files);
> 
> If threads are using close() on random fds, final result is not
> guaranteed anyway.
> 

Well, I don't see how this could be used to fix the problem.

If you are retrying and see NULL, you don't know whether your previous
update was not picked up by memcpy OR the fd got closed, which also
unreferenced the file you are installing. But you can't tell what
happened.

If you see non-NULL and what you found is not the file you are
installing, you know the file was freed so you can't close the old file.

One could try to introduce an invariant that files installed in a
lockless manner have to start with refcount 1, you still can't infer
anything from the fact that the counter is 1 when you retry (even if you
take the lock). It could have been duped, or even sent over a unix
socket and closed (although that would surely require a solid pause in
execution) and who knows what else.

In general I would say this approach is too hard to get right to be
worthwhile given the expected speedup.
Mateusz Guzik April 20, 2015, 1:43 p.m. UTC | #14
On Mon, Apr 20, 2015 at 03:06:33PM +0200, Mateusz Guzik wrote:
> On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > 
> > > I would say this makes the use of seq counter impossible. Even if we
> > > decided to fall back to a lock on retry, we cannot know what to do if
> > > the slot is reserved - it very well could be that something called
> > > close, and something else reserved the slot, so putting the file inside
> > > could be really bad. In fact we would be putting a file for which we
> > > don't have a reference anymore.
> > > 
> > > However, not all hope is lost and I still think we can speed things up.
> > > 
> > > A locking primitive which only locks stuff for current cpu and has
> > > another mode where it locks stuff for all cpus would do the trick just
> > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > I want.
> > > 
> > > table reallocation is an extremely rare operation, so this should be
> > > fine. It would take the lock 'globally' for given table.
> > 
> > It would also mean percpu_alloc() for each descriptor table...
> 
> Well, as noted, I had not checked how it's implemented at the time
> of writing that message. I agree embedding something like this into files
> struct is a non-starter.
> 
> I would say this could work with a small set of locks, selected by hashing
> struct files pointer.
> 
> Table resizing is supposed to be extremely rare - most processes should
> not need it at all (if they do, the default size is too small and should
> be adjusted). Not only that, the lock is only needed if the process in
> question is multithreaded.
> 
> So I would say this would not contend in real-world workloads, but still
> looks crappy.
> 
> Unfortunately the whole thing loses the original appeal of a simple hack
> with no potential performance drawbacks. Maybe I'll hack it up later and
> run some tests anyway.
> 

I just came up with another stupid hack, but this time it could really
work just fine.

Note that the entire issue stems from the fact that the table can be
resized at any moment. If only we had a guarantee the table "stands
still", we would not even need that sequence couner. fd_install could
just plop the file in.

So a stupid hack which comes to mind tells the kernel to make sure the
table is big enough and then never resize it ever again (inherited on
fork, cleared on exec):
prctl(FDTABLE_SIZE_FIXED, BIGNUM);

or

dup2(0, BIGNUM); /* sizes the table appropriately */
close(BIGNUM);
prctl(FDTABLE_SIZE_FIXED);

Thoughts?
Mateusz Guzik April 20, 2015, 3:10 p.m. UTC | #15
On Mon, Apr 20, 2015 at 03:43:26PM +0200, Mateusz Guzik wrote:
> On Mon, Apr 20, 2015 at 03:06:33PM +0200, Mateusz Guzik wrote:
> > On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> > > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > > 
> > > > I would say this makes the use of seq counter impossible. Even if we
> > > > decided to fall back to a lock on retry, we cannot know what to do if
> > > > the slot is reserved - it very well could be that something called
> > > > close, and something else reserved the slot, so putting the file inside
> > > > could be really bad. In fact we would be putting a file for which we
> > > > don't have a reference anymore.
> > > > 
> > > > However, not all hope is lost and I still think we can speed things up.
> > > > 
> > > > A locking primitive which only locks stuff for current cpu and has
> > > > another mode where it locks stuff for all cpus would do the trick just
> > > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > > I want.
> > > > 
> > > > table reallocation is an extremely rare operation, so this should be
> > > > fine. It would take the lock 'globally' for given table.
> > > 
> > > It would also mean percpu_alloc() for each descriptor table...
> > 
> > Well, as noted, I had not checked how it's implemented at the time
> > of writing that message. I agree embedding something like this into files
> > struct is a non-starter.
> > 
> > I would say this could work with a small set of locks, selected by hashing
> > struct files pointer.
> > 
> > Table resizing is supposed to be extremely rare - most processes should
> > not need it at all (if they do, the default size is too small and should
> > be adjusted). Not only that, the lock is only needed if the process in
> > question is multithreaded.
> > 
> > So I would say this would not contend in real-world workloads, but still
> > looks crappy.
> > 
> > Unfortunately the whole thing loses the original appeal of a simple hack
> > with no potential performance drawbacks. Maybe I'll hack it up later and
> > run some tests anyway.
> > 
> 
> I just came up with another stupid hack, but this time it could really
> work just fine.
> 
> Note that the entire issue stems from the fact that the table can be
> resized at any moment. If only we had a guarantee the table "stands
> still", we would not even need that sequence couner. fd_install could
> just plop the file in.
> 
> So a stupid hack which comes to mind tells the kernel to make sure the
> table is big enough and then never resize it ever again (inherited on
> fork, cleared on exec):
> prctl(FDTABLE_SIZE_FIXED, BIGNUM);
> 
> or
> 
> dup2(0, BIGNUM); /* sizes the table appropriately */
> close(BIGNUM);
> prctl(FDTABLE_SIZE_FIXED);
> 
> Thoughts?

Sorry for spam but I came up with another hack. :)

The idea is that we can have a variable which would signify that the
given thread is playing with the fd table in fd_install (kind of a lock
embedded into task_struct). We would also have a flag in files struct
indicating that a thread would like to resize it.

expand_fdtable would set the flag and iterate over all threads waiting
for all of them to have the var set to 0.

fd_install would set the var, test the flag and if needed would just
unset the var and take the spin lock associated with the table.

This way the common case (nobody resizes the table) is lockless.

Resizing operation can get expensive but that should be totally fine.
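
In pseudo-C the scheme would look something like this (a sketch only;
fd_installing and resize_in_progress are made-up names for illustration):

	/* fd_install() fast path */
	current->fd_installing = 1;
	smp_mb();
	if (unlikely(files->resize_in_progress)) {
		current->fd_installing = 0;
		/* slow path: take files->file_lock instead */
	} else {
		fdt = rcu_dereference(files->fdt);
		rcu_assign_pointer(fdt->fd[fd], file);
		current->fd_installing = 0;
	}

	/* expand_fdtable(): flag the resize, then wait out the fast paths */
	files->resize_in_progress = 1;
	smp_mb();
	for_each_thread(current, t)
		while (t->fd_installing)
			cpu_relax();
	/* now it is safe to copy and publish the new table */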

As a hack in a hack we could abuse rcu's counter to serve as the "lock".

Thoughts?
Eric Dumazet April 20, 2015, 4:46 p.m. UTC | #16
On Mon, 2015-04-20 at 15:41 +0200, Mateusz Guzik wrote:
> On Sat, Apr 18, 2015 at 12:41:38PM -0700, Eric Dumazet wrote:
> > On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> > > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > > 
> > > > I would say this makes the use of seq counter impossible. Even if we
> > > > decided to fall back to a lock on retry, we cannot know what to do if
> > > > the slot is reserved - it very well could be that something called
> > > > close, and something else reserved the slot, so putting the file inside
> > > > could be really bad. In fact we would be putting a file for which we
> > > > don't have a reference anymore.
> > > > 
> > > > However, not all hope is lost and I still think we can speed things up.
> > > > 
> > > > A locking primitive which only locks stuff for current cpu and has
> > > > another mode where it locks stuff for all cpus would do the trick just
> > > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > > I want.
> > > > 
> > > > table reallocation is an extremely rare operation, so this should be
> > > > fine. It would take the lock 'globally' for given table.
> > > 
> > > It would also mean percpu_alloc() for each descriptor table...
> > 
> > I would rather use an xchg() instead of rcu_assign_pointer()
> > 
> > old = xchg(&fdt->fd[fd], file);
> > if (unlikely(old))
> > 	filp_close(old, files);
> > 
> > If threads are using close() on random fds, final result is not
> > guaranteed anyway.
> > 
> 
> Well, I don't see how this could be used to fix the problem.
> 
> If you are retrying and see NULL, you don't know whether your previous
> update was not picked up by memcpy OR the fd got closed, which also
> unreferenced the file you are installing. But you can't tell what
> happened.
> 
> If you see non-NULL and what you found is not the file you are
> installing, you know the file was freed so you can't close the old file.
> 
> One could try to introduce an invariant that files installed in a
> lockless manner have to start with refcount 1, you still can't infer
> anything from the fact that the counter is 1 when you retry (even if you
> take the lock). It could have been duped, or even sent over a unix
> socket and closed (although that would surely require a solid pause in
> execution) and who knows what else.
> 
> In general I would say this approach is too hard to get right to be
> worthwhile given the expected speedup.
> 

Hey, that's because I really meant (during the weekend)


Eric Dumazet April 20, 2015, 4:48 p.m. UTC | #17
On Mon, 2015-04-20 at 09:46 -0700, Eric Dumazet wrote:

> 
> Hey, that's because I really meant (during the weekend)

old = xchg(&fdt->fd[fd], file);
if (unlikely(old && old != file))
	filp_close(old, files);

That's going to be hard.

If you thought this problem was trivial, it probably means you missed
something.



Eric Dumazet April 20, 2015, 5:15 p.m. UTC | #18
On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:

> Sorry for spam but I came up with another hack. :)
> 
> The idea is that we can have a variable which would signify that the
> given thread is playing with the fd table in fd_install (kind of a lock
> embedded into task_struct). We would also have a flag in files struct
> indicating that a thread would like to resize it.
> 
> expand_fdtable would set the flag and iterate over all threads waiting
> for all of them to have the var set to 0.

The opposite: you have to block them in some way and add an rcu_sched()
or something.

Another way would be to expand the table leaving the old one in place,
thanks to a new vrealloc() API. This would require all file tables to use
at least one page.

(Instead of using a new set of pages for the new vmalloc()ed area, reuse
the pages that are already mapped into the previous vmalloc() area.)

Right now, expanding 4M slots to 8M slots is taking a long time and is a
latency killer.


Patch

==============================================

Subject: [PATCH] fs: use a sequence counter instead of file_lock in fd_install

Because the lock is not held, it is possible that fdtable will be
reallocated as we fill it.

RCU is used to guarantee the old table is not freed, just in case we
happen to write to it (which is harmless).

A sequence counter is used to ensure we updated the right table.

Signed-off-by: Mateusz Guzik <mguzik@redhat.com>
---
 fs/file.c               | 24 +++++++++++++++++++-----
 include/linux/fdtable.h |  5 +++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89..bd1ef4c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -165,8 +165,10 @@  static int expand_fdtable(struct files_struct *files, int nr)
 	cur_fdt = files_fdtable(files);
 	if (nr >= cur_fdt->max_fds) {
 		/* Continue as planned */
+		write_seqcount_begin(&files->fdt_seqcount);
 		copy_fdtable(new_fdt, cur_fdt);
 		rcu_assign_pointer(files->fdt, new_fdt);
+		write_seqcount_end(&files->fdt_seqcount);
 		if (cur_fdt != &files->fdtab)
 			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
@@ -256,6 +258,7 @@  struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	seqcount_init(&newf->fdt_seqcount);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -429,6 +432,7 @@  void exit_files(struct task_struct *tsk)
 
 struct files_struct init_files = {
 	.count		= ATOMIC_INIT(1),
+	.fdt_seqcount	= SEQCNT_ZERO(fdt_seqcount),
 	.fdt		= &init_files.fdtab,
 	.fdtab		= {
 		.max_fds	= NR_OPEN_DEFAULT,
@@ -552,12 +556,22 @@  EXPORT_SYMBOL(put_unused_fd);
 void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
+	unsigned long seq;
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+
+	rcu_read_lock();
+	do {
+		seq = read_seqcount_begin(&files->fdt_seqcount);
+		fdt = files_fdtable_seq(files);
+		/*
+		 * Entry in the table can already be equal to file if we
+		 * had to restart and copy_fdtable picked up our update.
+		 */
+		BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
+		rcu_assign_pointer(fdt->fd[fd], file);
+		smp_mb();
+	} while (__read_seqcount_retry(&files->fdt_seqcount, seq));
+	rcu_read_unlock();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87b..9e41765 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@ 
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/seqlock.h>
 
 #include <linux/atomic.h>
 
@@ -47,6 +48,7 @@  struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	seqcount_t fdt_seqcount;
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
   /*
@@ -69,6 +71,9 @@  struct dentry;
 #define files_fdtable(files) \
 	rcu_dereference_check_fdtable((files), (files)->fdt)
 
+#define files_fdtable_seq(files) \
+	rcu_dereference((files)->fdt)
+
 /*
  * The caller must ensure that fd table isn't shared or hold rcu or file lock
  */