
[08/11] SUNRPC: get rid of the request wait queue

Message ID 1407085393-3175-9-git-send-email-trond.myklebust@primarydata.com (mailing list archive)
State New, archived

Commit Message

Trond Myklebust Aug. 3, 2014, 5:03 p.m. UTC
We're always _only_ waking up tasks from within the sp_threads list, so
we know that they are enqueued and alive. The rq_wait waitqueue is just
a distraction with extra atomic semantics.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/svc.h |  1 -
 net/sunrpc/svc.c           |  2 --
 net/sunrpc/svc_xprt.c      | 32 +++++++++++++++++---------------
 3 files changed, 17 insertions(+), 18 deletions(-)

Comments

J. Bruce Fields Aug. 12, 2014, 7:53 p.m. UTC | #1
On Sun, Aug 03, 2014 at 01:03:10PM -0400, Trond Myklebust wrote:
> We're always _only_ waking up tasks from within the sp_threads list, so
> we know that they are enqueued and alive. The rq_wait waitqueue is just
> a distraction with extra atomic semantics.
> 
> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
[...]
> -		rqstp->rq_xprt = xprt;
> +		/* Note the order of the following 3 lines:
> +		 * We want to assign xprt to rqstp->rq_xprt only _after_
> +		 * we've woken up the process, so that we don't race with
> +		 * the lockless check in svc_get_next_xprt().

Sorry, I'm not following this: what exactly is the race?

--b.

Trond Myklebust Aug. 12, 2014, 8:09 p.m. UTC | #2
On Tue, Aug 12, 2014 at 3:53 PM, Bruce Fields <bfields@fieldses.org> wrote:
> On Sun, Aug 03, 2014 at 01:03:10PM -0400, Trond Myklebust wrote:
[...]
>> +             /* Note the order of the following 3 lines:
>> +              * We want to assign xprt to rqstp->rq_xprt only _after_
>> +              * we've woken up the process, so that we don't race with
>> +              * the lockless check in svc_get_next_xprt().
>
> Sorry, I'm not following this: what exactly is the race?

There are 2 potential races, both due to the lockless check for rqstp->rq_xprt.

1) You need to ensure that the reference count in xprt is bumped
before assigning to rqstp->rq_xprt, because svc_get_next_xprt() does a
lockless check for rqstp->rq_xprt != NULL, so there is no lock to
ensure that the refcount bump occurs before whoever called
svc_get_next_xprt() calls svc_xprt_put()...

2) You want to ensure that you don't call wake_up_process() after the
task has exited svc_get_next_xprt(), since you would no longer be
guaranteed that the task still exists. By calling wake_up_process()
before setting rqstp->rq_xprt, you ensure that if the task wakes up on
its own before we call wake_up_process(), then the test for
rqstp->rq_xprt == NULL forces it into the spin_lock_bh() protected
region, where it is safe to test rqstp->rq_xprt again and decide
whether or not the task is still queued on &pool->sp_threads.
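
To make both concrete, here is a rough sketch of the two sides after
this patch (illustrative only; the kthread_should_stop() handling,
try_to_freeze() and error paths are omitted):

	/* Waker: svc_xprt_do_enqueue(), with pool->sp_lock held */
	svc_xprt_get(xprt);		  /* 1. take the reference first */
	wake_up_process(rqstp->rq_task);  /* 2. task is on sp_threads,
					   *    so it is known alive    */
	rqstp->rq_xprt = xprt;		  /* 3. publish last: after this
					   *    the waiter may run off
					   *    with the xprt           */

	/* Waiter: svc_get_next_xprt(), simplified */
	set_current_state(TASK_INTERRUPTIBLE);
	svc_thread_enqueue(pool, rqstp);
	spin_unlock_bh(&pool->sp_lock);
	schedule_timeout(timeout);	  /* may return on timeout, not
					   * only on wake_up_process()  */
	xprt = rqstp->rq_xprt;		  /* lockless check */
	if (xprt != NULL)
		return xprt;		  /* reference already taken    */
	spin_lock_bh(&pool->sp_lock);	  /* slow path: recheck under
					   * the lock                   */

If the waker published rqstp->rq_xprt before taking its reference, a
timeout-woken waiter could return and svc_xprt_put() the xprt before
the waker's svc_xprt_get() had run.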
J. Bruce Fields Aug. 12, 2014, 8:23 p.m. UTC | #3
On Tue, Aug 12, 2014 at 04:09:52PM -0400, Trond Myklebust wrote:
> On Tue, Aug 12, 2014 at 3:53 PM, Bruce Fields <bfields@fieldses.org> wrote:
[...]

Got it.  So how about making that comment:

	/*
	 * Once we assign rq_xprt, a concurrent task in
	 * svc_get_next_xprt() could put the xprt, or could exit.
	 * Therefore, the get and the wake_up need to come first.
	 */

Is that close enough?

--b.
Trond Myklebust Aug. 12, 2014, 8:54 p.m. UTC | #4
On Tue, Aug 12, 2014 at 4:23 PM, Bruce Fields <bfields@fieldses.org> wrote:
[...]
>
> Got it.  So how about making that comment:
>
>         /*
>          * Once we assign rq_xprt, a concurrent task in
>          * svc_get_next_xprt() could put the xprt, or could exit.
>          * Therefore, the get and the wake_up need to come first.
>          */
>
> Is that close enough?
>

It's close, but it's not so much any concurrent task: it's
specifically the task we're trying to wake up that may find itself
waking up prematurely when the schedule_timeout() expires, and then
racing with us via the lockless check.
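
Roughly, the benign interleaving that the ordering is designed for
looks like this (an illustrative timeline, not taken from a trace):

	/*  svc_xprt_do_enqueue()           svc_get_next_xprt()
	 *
	 *                                  schedule_timeout() expires
	 *                                  xprt = rqstp->rq_xprt; (NULL)
	 *  svc_xprt_get(xprt);
	 *  wake_up_process(rq_task);       (harmless: task already runs)
	 *  rqstp->rq_xprt = xprt;
	 *                                  spin_lock_bh(&pool->sp_lock);
	 *                                  recheck rq_xprt under the lock
	 */

Had we published rqstp->rq_xprt before the wake_up_process() call, the
prematurely woken task could instead have seen it non-NULL, returned
from svc_get_next_xprt(), and exited before we touched rq_task again.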
J. Bruce Fields Aug. 12, 2014, 9:29 p.m. UTC | #5
On Tue, Aug 12, 2014 at 04:54:53PM -0400, Trond Myklebust wrote:
[...]
> 
> It's close, but it's not so much any concurrent task: it's
> specifically the task we're trying to wake up that may find itself
> waking up prematurely when the schedule_timeout() expires, and then
> racing with us via the lockless check.


Got it.  I'll make it:

	/*
	 * The task rq_task could be concurrently executing the lockless
	 * check of rq_xprt in svc_get_next_xprt().  Once we set
	 * rq_xprt, that task could return from svc_get_next_xprt(), and
	 * then could put the last reference to the xprt, or could exit.
	 * Therefore, our get and wake_up need to come first:
	 */

--b.
Trond Myklebust Aug. 12, 2014, 9:34 p.m. UTC | #6
On Tue, Aug 12, 2014 at 5:29 PM, Bruce Fields <bfields@fieldses.org> wrote:
[...]
> Got it.  I'll make it:
>
>         /*
>          * The task rq_task could be concurrently executing the lockless
>          * check of rq_xprt in svc_get_next_xprt().  Once we set
>          * rq_xprt, that task could return from svc_get_next_xprt(), and
>          * then could put the last reference to the xprt, or could exit.
>          * Therefore, our get and wake_up need to come first:
>          */
>

Looks good.

Patch

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 1bc7cd05b22e..3ec769b65c7f 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -280,7 +280,6 @@  struct svc_rqst {
 	int			rq_splice_ok;   /* turned off in gss privacy
 						 * to prevent encrypting page
 						 * cache pages */
-	wait_queue_head_t	rq_wait;	/* synchronization */
 	struct task_struct	*rq_task;	/* service thread */
 };
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5de6801cd924..dfb78c4f3031 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -612,8 +612,6 @@  svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 	if (!rqstp)
 		goto out_enomem;
 
-	init_waitqueue_head(&rqstp->rq_wait);
-
 	serv->sv_nrthreads++;
 	spin_lock_bh(&pool->sp_lock);
 	pool->sp_nrthreads++;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 2c30193c7a13..438e91c12851 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -348,8 +348,6 @@  static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 
 	cpu = get_cpu();
 	pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
-	put_cpu();
-
 	spin_lock_bh(&pool->sp_lock);
 
 	if (!list_empty(&pool->sp_threads) &&
@@ -382,10 +380,15 @@  static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 			printk(KERN_ERR
 				"svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
 				rqstp, rqstp->rq_xprt);
-		rqstp->rq_xprt = xprt;
+		/* Note the order of the following 3 lines:
+		 * We want to assign xprt to rqstp->rq_xprt only _after_
+		 * we've woken up the process, so that we don't race with
+		 * the lockless check in svc_get_next_xprt().
+		 */
 		svc_xprt_get(xprt);
+		wake_up_process(rqstp->rq_task);
+		rqstp->rq_xprt = xprt;
 		pool->sp_stats.threads_woken++;
-		wake_up(&rqstp->rq_wait);
 	} else {
 		dprintk("svc: transport %p put into queue\n", xprt);
 		list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
@@ -394,6 +397,7 @@  static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 
 out_unlock:
 	spin_unlock_bh(&pool->sp_lock);
+	put_cpu();
 }
 
 /*
@@ -509,7 +513,7 @@  void svc_wake_up(struct svc_serv *serv)
 			svc_thread_dequeue(pool, rqstp);
 			rqstp->rq_xprt = NULL;
 			 */
-			wake_up(&rqstp->rq_wait);
+			wake_up_process(rqstp->rq_task);
 		} else
 			pool->sp_task_pending = 1;
 		spin_unlock_bh(&pool->sp_lock);
@@ -628,7 +632,6 @@  static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 {
 	struct svc_xprt *xprt;
 	struct svc_pool		*pool = rqstp->rq_pool;
-	DECLARE_WAITQUEUE(wait, current);
 	long			time_left;
 
 	/* Normally we will wait up to 5 seconds for any required
@@ -654,15 +657,15 @@  static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 			xprt = ERR_PTR(-EAGAIN);
 			goto out;
 		}
-		/* No data pending. Go to sleep */
-		svc_thread_enqueue(pool, rqstp);
-
 		/*
 		 * We have to be able to interrupt this wait
 		 * to bring down the daemons ...
 		 */
 		set_current_state(TASK_INTERRUPTIBLE);
 
+		/* No data pending. Go to sleep */
+		svc_thread_enqueue(pool, rqstp);
+
 		/*
 		 * checking kthread_should_stop() here allows us to avoid
 		 * locking and signalling when stopping kthreads that call
@@ -676,14 +679,13 @@  static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 			goto out;
 		}
 
-		add_wait_queue(&rqstp->rq_wait, &wait);
 		spin_unlock_bh(&pool->sp_lock);
 
 		time_left = schedule_timeout(timeout);
+		__set_current_state(TASK_RUNNING);
 
 		try_to_freeze();
 
-		remove_wait_queue(&rqstp->rq_wait, &wait);
 		xprt = rqstp->rq_xprt;
 		if (xprt != NULL)
 			return xprt;
@@ -786,10 +788,10 @@  int svc_recv(struct svc_rqst *rqstp, long timeout)
 		printk(KERN_ERR
 			"svc_recv: service %p, transport not NULL!\n",
 			 rqstp);
-	if (waitqueue_active(&rqstp->rq_wait))
-		printk(KERN_ERR
-			"svc_recv: service %p, wait queue active!\n",
-			 rqstp);
+
+	/* Make sure the task pointer is set! */
+	if (WARN_ON_ONCE(!rqstp->rq_task))
+		rqstp->rq_task = current_task;
 
 	err = svc_alloc_arg(rqstp);
 	if (err)