diff mbox

[[RFC] 1/1] SUNRPC: dynamic rpc_slot allocator for TCP

Message ID 1304386808-2733-2-git-send-email-andros@netapp.com
State New, archived
Headers show

Commit Message

Andy Adamson May 3, 2011, 1:40 a.m. UTC
From: Andy Adamson <andros@netapp.com>

Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
can fully utilize the negotiated TCP window.

Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
rpc_xprt allocation.

Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
For TCP, trigger a dyamic slot allocation in response to a write_space
callback which is in turn called when the TCP layer is waiting for buffer space.

Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
allocator uses GFP_NOWAIT and will return without allocating a slot if
GFP_NOWAIT allocation fails. This is OK because the write_space callback will
be called again, and the dynamic slot allocator can retry.

Signed-off-by: Andy Adamson <andros@netap.com>
---
 include/linux/sunrpc/sched.h |    2 +
 include/linux/sunrpc/xprt.h  |    6 +++-
 net/sunrpc/clnt.c            |    4 ++
 net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
 net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
 net/sunrpc/xprtsock.c        |    1 +
 6 files changed, 117 insertions(+), 10 deletions(-)

Comments

Jeff Layton May 4, 2011, 12:20 a.m. UTC | #1
On Mon,  2 May 2011 21:40:08 -0400
andros@netapp.com wrote:

> From: Andy Adamson <andros@netapp.com>
> 
> Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
> can fully utilize the negotiated TCP window.
> 
> Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
> cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
> rpc_xprt allocation.
> 
> Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
> For TCP, trigger a dyamic slot allocation in response to a write_space
> callback which is in turn called when the TCP layer is waiting for buffer space.
> 
> Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
> allocator uses GFP_NOWAIT and will return without allocating a slot if
> GFP_NOWAIT allocation fails. This is OK because the write_space callback will
> be called again, and the dynamic slot allocator can retry.
> 
> Signed-off-by: Andy Adamson <andros@netap.com>
> ---
>  include/linux/sunrpc/sched.h |    2 +
>  include/linux/sunrpc/xprt.h  |    6 +++-
>  net/sunrpc/clnt.c            |    4 ++
>  net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
>  net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
>  net/sunrpc/xprtsock.c        |    1 +
>  6 files changed, 117 insertions(+), 10 deletions(-)
> 

Nice work, comments inline below...


[...]

> +
> +/*
> + * Static transport rpc_slot allocation called only at rpc_xprt allocation.
> + * No need to take the xprt->reserve_lock.
> + */
> +int
> +xprt_alloc_slot_entries(struct rpc_xprt *xprt, int num_req)
> +{
> +	struct rpc_rqst *req;
> +	int i;
> +
> +	for (i = 0; i < num_req; i++) {
> +		req = rpc_alloc_slot(GFP_KERNEL);
> +		if (!req)
> +			return -ENOMEM;
> +		memset(req, 0, sizeof(*req));
> +		list_add(&req->rq_list, &xprt->free);
> +	}
> +	dprintk("<-- %s mempool_alloc %d reqs\n", __func__,
> +		xprt->max_reqs);
> +	return 0;
> +}
> +

So, I don't quite get this...

You declare a global mempool early on, and then allocate from that
mempool for a list of static entries. Doesn't that sort of rob you of
any benefit of using a mempool here? IOW, won't the static allocations
potentially rob the mempool of "guaranteed" entries such that the
dynamic ones eventually all turn into slab allocations anyway?

What I think would make more sense would be to have multiple mempools
-- one per xprt and simply set the mempool size to the number of
"static" entries that you want for the mempool. Then you could get rid
of the free list, and just do allocations out of the mempool directly.
You'll be guaranteed to be able to allocate up to the number in the
mempool and everything above that would just becomes a slab allocation.
Trond Myklebust May 4, 2011, 12:44 a.m. UTC | #2
On Tue, 2011-05-03 at 20:20 -0400, Jeff Layton wrote:
> On Mon,  2 May 2011 21:40:08 -0400
> andros@netapp.com wrote:
> 
> > From: Andy Adamson <andros@netapp.com>
> > 
> > Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
> > can fully utilize the negotiated TCP window.
> > 
> > Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
> > cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
> > rpc_xprt allocation.
> > 
> > Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
> > For TCP, trigger a dyamic slot allocation in response to a write_space
> > callback which is in turn called when the TCP layer is waiting for buffer space.
> > 
> > Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
> > allocator uses GFP_NOWAIT and will return without allocating a slot if
> > GFP_NOWAIT allocation fails. This is OK because the write_space callback will
> > be called again, and the dynamic slot allocator can retry.
> > 
> > Signed-off-by: Andy Adamson <andros@netap.com>
> > ---
> >  include/linux/sunrpc/sched.h |    2 +
> >  include/linux/sunrpc/xprt.h  |    6 +++-
> >  net/sunrpc/clnt.c            |    4 ++
> >  net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
> >  net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
> >  net/sunrpc/xprtsock.c        |    1 +
> >  6 files changed, 117 insertions(+), 10 deletions(-)
> > 
> 
> Nice work, comments inline below...
> 
> 
> [...]
> 
> > +
> > +/*
> > + * Static transport rpc_slot allocation called only at rpc_xprt allocation.
> > + * No need to take the xprt->reserve_lock.
> > + */
> > +int
> > +xprt_alloc_slot_entries(struct rpc_xprt *xprt, int num_req)
> > +{
> > +	struct rpc_rqst *req;
> > +	int i;
> > +
> > +	for (i = 0; i < num_req; i++) {
> > +		req = rpc_alloc_slot(GFP_KERNEL);
> > +		if (!req)
> > +			return -ENOMEM;
> > +		memset(req, 0, sizeof(*req));
> > +		list_add(&req->rq_list, &xprt->free);
> > +	}
> > +	dprintk("<-- %s mempool_alloc %d reqs\n", __func__,
> > +		xprt->max_reqs);
> > +	return 0;
> > +}
> > +
> 
> So, I don't quite get this...
> 
> You declare a global mempool early on, and then allocate from that
> mempool for a list of static entries. Doesn't that sort of rob you of
> any benefit of using a mempool here? IOW, won't the static allocations
> potentially rob the mempool of "guaranteed" entries such that the
> dynamic ones eventually all turn into slab allocations anyway?

The other thing is that we really should _never_ be using GFP_KERNEL
allocations in any paths that might be used for writebacks, since that
might recurse back into the filesystem due to direct memory reclaims. In
fact, since this code path will commonly be used by rpciod, then we
should rather be using GFP_ATOMIC and/or GFP_NOWAIT (see rpc_malloc).

> What I think would make more sense would be to have multiple mempools
> -- one per xprt and simply set the mempool size to the number of
> "static" entries that you want for the mempool. Then you could get rid
> of the free list, and just do allocations out of the mempool directly.
> You'll be guaranteed to be able to allocate up to the number in the
> mempool and everything above that would just becomes a slab allocation.

I'm not sure that we need multiple mempools: all we really require is
that memory reclaims to be able to make some form of progress.
I'd therefore rather advocate a global mempool (which matches our policy
on the rpc buffer pool), but that we allow the allocations to fail if
we're really low on memory.

The problem with multiple mempools is that if there is no activity on
one of those filesystems, then we're basically locking up memory for no
good reason.
NeilBrown May 4, 2011, 1:18 a.m. UTC | #3
On Tue, 03 May 2011 20:44:50 -0400 Trond Myklebust
<Trond.Myklebust@netapp.com> wrote:

> On Tue, 2011-05-03 at 20:20 -0400, Jeff Layton wrote:
> > On Mon,  2 May 2011 21:40:08 -0400
> > andros@netapp.com wrote:
> > 
> > > From: Andy Adamson <andros@netapp.com>
> > > 
> > > Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
> > > can fully utilize the negotiated TCP window.
> > > 
> > > Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
> > > cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
> > > rpc_xprt allocation.
> > > 
> > > Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
> > > For TCP, trigger a dyamic slot allocation in response to a write_space
> > > callback which is in turn called when the TCP layer is waiting for buffer space.
> > > 
> > > Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
> > > allocator uses GFP_NOWAIT and will return without allocating a slot if
> > > GFP_NOWAIT allocation fails. This is OK because the write_space callback will
> > > be called again, and the dynamic slot allocator can retry.
> > > 
> > > Signed-off-by: Andy Adamson <andros@netap.com>
> > > ---
> > >  include/linux/sunrpc/sched.h |    2 +
> > >  include/linux/sunrpc/xprt.h  |    6 +++-
> > >  net/sunrpc/clnt.c            |    4 ++
> > >  net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
> > >  net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
> > >  net/sunrpc/xprtsock.c        |    1 +
> > >  6 files changed, 117 insertions(+), 10 deletions(-)
> > > 
> > 
> > Nice work, comments inline below...
> > 
> > 
> > [...]
> > 
> > > +
> > > +/*
> > > + * Static transport rpc_slot allocation called only at rpc_xprt allocation.
> > > + * No need to take the xprt->reserve_lock.
> > > + */
> > > +int
> > > +xprt_alloc_slot_entries(struct rpc_xprt *xprt, int num_req)
> > > +{
> > > +	struct rpc_rqst *req;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < num_req; i++) {
> > > +		req = rpc_alloc_slot(GFP_KERNEL);
> > > +		if (!req)
> > > +			return -ENOMEM;
> > > +		memset(req, 0, sizeof(*req));
> > > +		list_add(&req->rq_list, &xprt->free);
> > > +	}
> > > +	dprintk("<-- %s mempool_alloc %d reqs\n", __func__,
> > > +		xprt->max_reqs);
> > > +	return 0;
> > > +}
> > > +
> > 
> > So, I don't quite get this...
> > 
> > You declare a global mempool early on, and then allocate from that
> > mempool for a list of static entries. Doesn't that sort of rob you of
> > any benefit of using a mempool here? IOW, won't the static allocations
> > potentially rob the mempool of "guaranteed" entries such that the
> > dynamic ones eventually all turn into slab allocations anyway?
> 
> The other thing is that we really should _never_ be using GFP_KERNEL
> allocations in any paths that might be used for writebacks, since that
> might recurse back into the filesystem due to direct memory reclaims. In
> fact, since this code path will commonly be used by rpciod, then we
> should rather be using GFP_ATOMIC and/or GFP_NOWAIT (see rpc_malloc).
> 
> > What I think would make more sense would be to have multiple mempools
> > -- one per xprt and simply set the mempool size to the number of
> > "static" entries that you want for the mempool. Then you could get rid
> > of the free list, and just do allocations out of the mempool directly.
> > You'll be guaranteed to be able to allocate up to the number in the
> > mempool and everything above that would just becomes a slab allocation.
> 
> I'm not sure that we need multiple mempools: all we really require is
> that memory reclaims to be able to make some form of progress.
> I'd therefore rather advocate a global mempool (which matches our policy
> on the rpc buffer pool), but that we allow the allocations to fail if
> we're really low on memory.
> 
> The problem with multiple mempools is that if there is no activity on
> one of those filesystems, then we're basically locking up memory for no
> good reason.
> 

It might be worth pointing out that mempools only use the reserved memory as
a last resort.   They try a normal allocation first and only when that fails
do they dip into the pool.  This is because normal allocation avoids taking
global locks on common case - they are CPU-local.  The pool is a global
resource (i.e. for all CPUs) so a spinlock is needed.

As GFP_KERNEL essentially never fails, it will never dip into the pool, so
there is not a lot of point using it in a mempool allocation (though I guess
it could make the code cleaner/neater and that would be a good thing).

In general, mempools should be sized to the minimum number of entries
necessary to make forward progress when memory pressure is high.  This is
typically 1.  Making mempools larger than this might allow throughput to be a
bit higher during that high-memory-pressure time, but wastes memory all the
rest of the time.

You only need multiple mempools when they might be used on the one request.
i.e. if two separate allocation are needed to handle the one request, then
they must come from two separate mempools.
If two different allocations will always be for two independent requests, then
they can safely share a mempool.

For rpc slots, I doubt you need mempools at all.
Mempools are only needed if failure is not an option and you would rather
wait, but you cannot wait for regular writeback because you are on the
writeback path.  So you use a mempool which waits for a previous request to
complete.  I don't think that describes rpc slots at all.
For rpc slots, you can afford to wait when setting up the transport, as you
are not on the writeout path yet, and later you can always cope with failure.
So just use kmalloc.

NeilBrown
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton May 4, 2011, 1:33 a.m. UTC | #4
On Tue, 03 May 2011 20:44:50 -0400
Trond Myklebust <Trond.Myklebust@netapp.com> wrote:

> On Tue, 2011-05-03 at 20:20 -0400, Jeff Layton wrote:
> > On Mon,  2 May 2011 21:40:08 -0400
> > andros@netapp.com wrote:
> > 
> > > From: Andy Adamson <andros@netapp.com>
> > > 
> > > Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
> > > can fully utilize the negotiated TCP window.
> > > 
> > > Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
> > > cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
> > > rpc_xprt allocation.
> > > 
> > > Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
> > > For TCP, trigger a dyamic slot allocation in response to a write_space
> > > callback which is in turn called when the TCP layer is waiting for buffer space.
> > > 
> > > Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
> > > allocator uses GFP_NOWAIT and will return without allocating a slot if
> > > GFP_NOWAIT allocation fails. This is OK because the write_space callback will
> > > be called again, and the dynamic slot allocator can retry.
> > > 
> > > Signed-off-by: Andy Adamson <andros@netap.com>
> > > ---
> > >  include/linux/sunrpc/sched.h |    2 +
> > >  include/linux/sunrpc/xprt.h  |    6 +++-
> > >  net/sunrpc/clnt.c            |    4 ++
> > >  net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
> > >  net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
> > >  net/sunrpc/xprtsock.c        |    1 +
> > >  6 files changed, 117 insertions(+), 10 deletions(-)
> > > 
> > 
> > Nice work, comments inline below...
> > 
> > 
> > [...]
> > 
> > > +
> > > +/*
> > > + * Static transport rpc_slot allocation called only at rpc_xprt allocation.
> > > + * No need to take the xprt->reserve_lock.
> > > + */
> > > +int
> > > +xprt_alloc_slot_entries(struct rpc_xprt *xprt, int num_req)
> > > +{
> > > +	struct rpc_rqst *req;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < num_req; i++) {
> > > +		req = rpc_alloc_slot(GFP_KERNEL);
> > > +		if (!req)
> > > +			return -ENOMEM;
> > > +		memset(req, 0, sizeof(*req));
> > > +		list_add(&req->rq_list, &xprt->free);
> > > +	}
> > > +	dprintk("<-- %s mempool_alloc %d reqs\n", __func__,
> > > +		xprt->max_reqs);
> > > +	return 0;
> > > +}
> > > +
> > 
> > So, I don't quite get this...
> > 
> > You declare a global mempool early on, and then allocate from that
> > mempool for a list of static entries. Doesn't that sort of rob you of
> > any benefit of using a mempool here? IOW, won't the static allocations
> > potentially rob the mempool of "guaranteed" entries such that the
> > dynamic ones eventually all turn into slab allocations anyway?
> 
> The other thing is that we really should _never_ be using GFP_KERNEL
> allocations in any paths that might be used for writebacks, since that
> might recurse back into the filesystem due to direct memory reclaims. In
> fact, since this code path will commonly be used by rpciod, then we
> should rather be using GFP_ATOMIC and/or GFP_NOWAIT (see rpc_malloc).
> 

I think GFP_NOWAIT is probably appropriate, and that seems to be what
the patch uses for the dynamic allocations. It only uses GFP_KERNEL for
the "static" ones (created at xprt_alloc time).

> > What I think would make more sense would be to have multiple mempools
> > -- one per xprt and simply set the mempool size to the number of
> > "static" entries that you want for the mempool. Then you could get rid
> > of the free list, and just do allocations out of the mempool directly.
> > You'll be guaranteed to be able to allocate up to the number in the
> > mempool and everything above that would just becomes a slab allocation.
> 
> I'm not sure that we need multiple mempools: all we really require is
> that memory reclaims to be able to make some form of progress.
> I'd therefore rather advocate a global mempool (which matches our policy
> on the rpc buffer pool), but that we allow the allocations to fail if
> we're really low on memory.
> 
> The problem with multiple mempools is that if there is no activity on
> one of those filesystems, then we're basically locking up memory for no
> good reason.
> 

But this patch does that too. It allocates a list of static entries at
xprt_alloc time that won't shrink when memory is tight. If that's
something we want to keep, then why bother with a mempool for the
"dynamic" entries? We'll already have a set of guaranteed entries (the
static ones).

The whole point of a mempool is to give you a pool of objects that has
a guaranteed minimum number. I think we just need to understand whether
that minimum should be a global one or per-xprt.
Trond Myklebust May 4, 2011, 1:46 a.m. UTC | #5
On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
> For rpc slots, I doubt you need mempools at all.
> Mempools are only needed if failure is not an option and you would rather
> wait, but you cannot wait for regular writeback because you are on the
> writeback path.  So you use a mempool which waits for a previous request to
> complete.  I don't think that describes rpc slots at all.
> For rpc slots, you can afford to wait when setting up the transport, as you
> are not on the writeout path yet, and later you can always cope with failure.
> So just use kmalloc.

Errr.... No. By the time you get to allocating an RPC slot, you may be
bang smack in the middle of the writeout path.

The scenario would be that something triggers a writeback (kswapd,
direct reclaim,...) which triggers an RPC call, which requires you to
allocate at least one rpc slot before you can put the write on the wire.

I agree with your assertion that we only need one successful slot
allocation in order to make overall forward progress, but we definitely
do need that one...
NeilBrown May 4, 2011, 2:07 a.m. UTC | #6
On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
<Trond.Myklebust@netapp.com> wrote:

> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
> > For rpc slots, I doubt you need mempools at all.
> > Mempools are only needed if failure is not an option and you would rather
> > wait, but you cannot wait for regular writeback because you are on the
> > writeback path.  So you use a mempool which waits for a previous request to
> > complete.  I don't think that describes rpc slots at all.
> > For rpc slots, you can afford to wait when setting up the transport, as you
> > are not on the writeout path yet, and later you can always cope with failure.
> > So just use kmalloc.
> 
> Errr.... No. By the time you get to allocating an RPC slot, you may be
> bang smack in the middle of the writeout path.
> 
> The scenario would be that something triggers a writeback (kswapd,
> direct reclaim,...) which triggers an RPC call, which requires you to
> allocate at least one rpc slot before you can put the write on the wire.
> 
> I agree with your assertion that we only need one successful slot
> allocation in order to make overall forward progress, but we definitely
> do need that one...
> 

Probably I misunderstood the code, but I thought that there was a base set of
slots preallocated.  Eventually one of those will become free won't it?

Looking at the code again, it only ever returns entries to the mempool when
it is about to destroy the xprt.  That makes no sense.  If you are using a
mempool, then you allocate when you need to use space, and free it as soon as
you have finished with it, so the next request can get a turn.

NeilBrown
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton May 4, 2011, 11:54 a.m. UTC | #7
On Wed, 4 May 2011 12:07:26 +1000
NeilBrown <neilb@suse.de> wrote:

> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
> <Trond.Myklebust@netapp.com> wrote:
> 
> > On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
> > > For rpc slots, I doubt you need mempools at all.
> > > Mempools are only needed if failure is not an option and you would rather
> > > wait, but you cannot wait for regular writeback because you are on the
> > > writeback path.  So you use a mempool which waits for a previous request to
> > > complete.  I don't think that describes rpc slots at all.
> > > For rpc slots, you can afford to wait when setting up the transport, as you
> > > are not on the writeout path yet, and later you can always cope with failure.
> > > So just use kmalloc.
> > 
> > Errr.... No. By the time you get to allocating an RPC slot, you may be
> > bang smack in the middle of the writeout path.
> > 
> > The scenario would be that something triggers a writeback (kswapd,
> > direct reclaim,...) which triggers an RPC call, which requires you to
> > allocate at least one rpc slot before you can put the write on the wire.
> > 
> > I agree with your assertion that we only need one successful slot
> > allocation in order to make overall forward progress, but we definitely
> > do need that one...
> > 
> 
> Probably I misunderstood the code, but I thought that there was a base set of
> slots preallocated.  Eventually one of those will become free won't it?
> 
> Looking at the code again, it only ever returns entries to the mempool when
> it is about to destroy the xprt.  That makes no sense.  If you are using a
> mempool, then you allocate when you need to use space, and free it as soon as
> you have finished with it, so the next request can get a turn.
> 

That was sort of my point, though I didn't articulate it very well.

I don't think we want to keep this "static" set of rqsts and the
mempool. It seems like that's somewhat redundant and more complex than
is warranted. The question is...which should we keep?

I think it sort of depends on what the intended model is. If we want to
allocate out of the "guaranteed" pool first and only when that's
exhausted fall back to doing dynamic allocations, then we probably want
to keep the static list of entries.

If we want to allocate dynamically most of the time and only dip into
the "guaranteed" pool when that can't be satisfied, then basing all of
this around mempools might make more sense.
Andy Adamson May 4, 2011, 2:54 p.m. UTC | #8
On May 4, 2011, at 7:54 AM, Jeff Layton wrote:

> On Wed, 4 May 2011 12:07:26 +1000
> NeilBrown <neilb@suse.de> wrote:
> 
>> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
>> <Trond.Myklebust@netapp.com> wrote:
>> 
>>> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
>>>> For rpc slots, I doubt you need mempools at all.
>>>> Mempools are only needed if failure is not an option and you would rather
>>>> wait, but you cannot wait for regular writeback because you are on the
>>>> writeback path.  So you use a mempool which waits for a previous request to
>>>> complete.  I don't think that describes rpc slots at all.
>>>> For rpc slots, you can afford to wait when setting up the transport, as you
>>>> are not on the writeout path yet, and later you can always cope with failure.
>>>> So just use kmalloc.
>>> 
>>> Errr.... No. By the time you get to allocating an RPC slot, you may be
>>> bang smack in the middle of the writeout path.
>>> 
>>> The scenario would be that something triggers a writeback (kswapd,
>>> direct reclaim,...) which triggers an RPC call, which requires you to
>>> allocate at least one rpc slot before you can put the write on the wire.
>>> 
>>> I agree with your assertion that we only need one successful slot
>>> allocation in order to make overall forward progress, but we definitely
>>> do need that one...
>>> 
>> 
>> Probably I misunderstood the code, but I thought that there was a base set of
>> slots preallocated.  Eventually one of those will become free won't it?
>> 
>> Looking at the code again, it only ever returns entries to the mempool when
>> it is about to destroy the xprt.  That makes no sense.  If you are using a
>> mempool, then you allocate when you need to use space, and free it as soon as
>> you have finished with it, so the next request can get a turn.
>> 
> 
> That was sort of my point, though I didn't articulate it very well.
> 
> I don't think we want to keep this "static" set of rqsts and the
> mempool. It seems like that's somewhat redundant and more complex than
> is warranted. The question is...which should we keep?
> 
> I think it sort of depends on what the intended model is. If we want to
> allocate out of the "guaranteed" pool first and only when that's
> exhausted fall back to doing dynamic allocations, then we probably want
> to keep the static list of entries.
> 
> If we want to allocate dynamically most of the time and only dip into
> the "guaranteed" pool when that can't be satisfied, then basing all of
> this around mempools might make more sense.

Thanks for the review - I've learned a lot about mempools from this discussion.

I've noticed that a lot of the time, (at least for TCP) rpc_slots are allocated and not used.
The default slot entries value is a guess. This is why I like the idea of dynamically allocating most 
of the time, and only dipping into the guaranteed pool when the dynamic allocation
can't be satisfied. e.g. get rid of the static set of slots.
Does this design require two mempools, one for dynamic allocation and a guaranteed one being small and "in reserve"?

For UDP and RDMA, the default # of slot entries can turn into an upper bound.
For TCP, the default # of slot entries would have no meaning WRT upper bound, as the upper bound is determined
by the TCP window size.

We should actually free the slots in xprt_free_slot to return the slot to the mempool.

-->Andy
> 
> -- 
> Jeff Layton <jlayton@redhat.com>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton May 4, 2011, 2:59 p.m. UTC | #9
On Mon,  2 May 2011 21:40:08 -0400
andros@netapp.com wrote:

> From: Andy Adamson <andros@netapp.com>
> 
> Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
> can fully utilize the negotiated TCP window.
> 
> Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
> cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
> rpc_xprt allocation.
> 
> Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
> For TCP, trigger a dyamic slot allocation in response to a write_space
> callback which is in turn called when the TCP layer is waiting for buffer space.
> 
> Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
> allocator uses GFP_NOWAIT and will return without allocating a slot if
> GFP_NOWAIT allocation fails. This is OK because the write_space callback will
> be called again, and the dynamic slot allocator can retry.
> 
> Signed-off-by: Andy Adamson <andros@netap.com>
> ---
>  include/linux/sunrpc/sched.h |    2 +
>  include/linux/sunrpc/xprt.h  |    6 +++-
>  net/sunrpc/clnt.c            |    4 ++
>  net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
>  net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
>  net/sunrpc/xprtsock.c        |    1 +
>  6 files changed, 117 insertions(+), 10 deletions(-)
> 

[...]

> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index 9494c37..1b0aa55 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -498,6 +498,7 @@ void xprt_write_space(struct rpc_xprt *xprt)
>  		dprintk("RPC:       write space: waking waiting task on "
>  				"xprt %p\n", xprt);
>  		rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task);
> +		set_bit(XPRT_WRITE_SPACE, &xprt->state);
>  	}
>  	spin_unlock_bh(&xprt->transport_lock);
>  }
> @@ -957,6 +958,66 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
>  	spin_unlock(&xprt->reserve_lock);
>  }
>  

Is the above a potential race as well? I looks possible (if unlikely)
that we'd wake up the task, and it find that XPRT_WRITE_SPACE is still
unset. Would it make more sense to do the set_bit call before waking up
the task?
Jeff Layton May 4, 2011, 3:08 p.m. UTC | #10
On Mon,  2 May 2011 21:40:08 -0400
andros@netapp.com wrote:

> +	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
> +		return;

Also, I'm not sure that a single bit really conveys enough information
for this.

IIUC, sk_write_space gets called when a packet is TCP ACK'ed. It seems
possible that we would sometimes have buffer space available to queue
the packet without sk_write_space being called. With this, we'll
basically be serializing all dynamic slot allocations behind the
sk_write_space callbacks.

Consider the case of many small TCP frames being sent after a large one
just got ACK'ed. Only one would be allowed to be sent, even though
there might be enough send buffer space to allow for more.

Would it instead make more sense to base this on the amount of space
available in the actual socket rather than this bit?
Andy Adamson May 4, 2011, 3:10 p.m. UTC | #11
On May 4, 2011, at 10:59 AM, Jeff Layton wrote:

> On Mon,  2 May 2011 21:40:08 -0400
> andros@netapp.com wrote:
> 
>> From: Andy Adamson <andros@netapp.com>
>> 
>> Hookup TCP congestion feedback into rpc_slot allocation so that the RPC layer
>> can fully utilize the negotiated TCP window.
>> 
>> Use a slab cache for rpc_slots. Statically allocate an rpc_xprt rpc_slot slab
>> cache using GFP_KERNEL to the RPC_DEF_SLOT_TABLE number of slots at
>> rpc_xprt allocation.
>> 
>> Add a dynamic rpc slot allocator to rpc_xprt_ops which is set only for TCP.
>> For TCP, trigger a dyamic slot allocation in response to a write_space
>> callback which is in turn called when the TCP layer is waiting for buffer space.
>> 
>> Dynamically add a slot at the beginning of the RPC call_transmit state. The slot
>> allocator uses GFP_NOWAIT and will return without allocating a slot if
>> GFP_NOWAIT allocation fails. This is OK because the write_space callback will
>> be called again, and the dynamic slot allocator can retry.
>> 
>> Signed-off-by: Andy Adamson <andros@netap.com>
>> ---
>> include/linux/sunrpc/sched.h |    2 +
>> include/linux/sunrpc/xprt.h  |    6 +++-
>> net/sunrpc/clnt.c            |    4 ++
>> net/sunrpc/sched.c           |   39 ++++++++++++++++++++++
>> net/sunrpc/xprt.c            |   75 +++++++++++++++++++++++++++++++++++++-----
>> net/sunrpc/xprtsock.c        |    1 +
>> 6 files changed, 117 insertions(+), 10 deletions(-)
>> 
> 
> [...]
> 
>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>> index 9494c37..1b0aa55 100644
>> --- a/net/sunrpc/xprt.c
>> +++ b/net/sunrpc/xprt.c
>> @@ -498,6 +498,7 @@ void xprt_write_space(struct rpc_xprt *xprt)
>> 		dprintk("RPC:       write space: waking waiting task on "
>> 				"xprt %p\n", xprt);
>> 		rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task);
>> +		set_bit(XPRT_WRITE_SPACE, &xprt->state);
>> 	}
>> 	spin_unlock_bh(&xprt->transport_lock);
>> }
>> @@ -957,6 +958,66 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
>> 	spin_unlock(&xprt->reserve_lock);
>> }
>> 
> 
> Is the above a potential race as well? I looks possible (if unlikely)
> that we'd wake up the task, and it find that XPRT_WRITE_SPACE is still
> unset. Would it make more sense to do the set_bit call before waking up
> the task?

Yes - good catch.

-->Andy
> 
> -- 
> Jeff Layton <jlayton@redhat.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton May 4, 2011, 3:18 p.m. UTC | #12
On Wed, 4 May 2011 10:54:57 -0400
Andy Adamson <andros@netapp.com> wrote:

> 
> On May 4, 2011, at 7:54 AM, Jeff Layton wrote:
> 
> > On Wed, 4 May 2011 12:07:26 +1000
> > NeilBrown <neilb@suse.de> wrote:
> > 
> >> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
> >> <Trond.Myklebust@netapp.com> wrote:
> >> 
> >>> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
> >>>> For rpc slots, I doubt you need mempools at all.
> >>>> Mempools are only needed if failure is not an option and you would rather
> >>>> wait, but you cannot wait for regular writeback because you are on the
> >>>> writeback path.  So you use a mempool which waits for a previous request to
> >>>> complete.  I don't think that describes rpc slots at all.
> >>>> For rpc slots, you can afford to wait when setting up the transport, as you
> >>>> are not on the writeout path yet, and later you can always cope with failure.
> >>>> So just use kmalloc.
> >>> 
> >>> Errr.... No. By the time you get to allocating an RPC slot, you may be
> >>> bang smack in the middle of the writeout path.
> >>> 
> >>> The scenario would be that something triggers a writeback (kswapd,
> >>> direct reclaim,...) which triggers an RPC call, which requires you to
> >>> allocate at least one rpc slot before you can put the write on the wire.
> >>> 
> >>> I agree with your assertion that we only need one successful slot
> >>> allocation in order to make overall forward progress, but we definitely
> >>> do need that one...
> >>> 
> >> 
> >> Probably I misunderstood the code, but I thought that there was a base set of
> >> slots preallocated.  Eventually one of those will become free won't it?
> >> 
> >> Looking at the code again, it only ever returns entries to the mempool when
> >> it is about to destroy the xprt.  That makes no sense.  If you are using a
> >> mempool, then you allocate when you need to use space, and free it as soon as
> >> you have finished with it, so the next request can get a turn.
> >> 
> > 
> > That was sort of my point, though I didn't articulate it very well.
> > 
> > I don't think we want to keep this "static" set of rqsts and the
> > mempool. It seems like that's somewhat redundant and more complex than
> > is warranted. The question is...which should we keep?
> > 
> > I think it sort of depends on what the intended model is. If we want to
> > allocate out of the "guaranteed" pool first and only when that's
> > exhausted fall back to doing dynamic allocations, then we probably want
> > to keep the static list of entries.
> > 
> > If we want to allocate dynamically most of the time and only dip into
> > the "guaranteed" pool when that can't be satisfied, then basing all of
> > this around mempools might make more sense.
> 
> Thanks for the review - I've learned a lot about mempools from this discussion.
> 
> I've noticed that a lot of the time, (at least for TCP) rpc_slots are allocated and not used.
> The default slot entries value is a guess. This is why I like the idea of dynamically allocating most 
> of the time, and only dipping into the guaranteed pool when the dynamic allocation
> can't be satisfied. e.g. get rid of the static set of slots.
> Does this design require two mempools, one for dynamic allocation and a guaranteed one being small and "in reserve"?
> 

No, a single mempool would give you that. Just declare the mempool with
the amount of "emergency" slots you want, and then allocate directly
from it and then free them back to the mempool. Most of the time (as
Neil points out) you'll get them allocated from the slabcache. It'll
it'll only dip into the reserved set when those can't be allocated.

The other thing you have to determine is whether you want this
guaranteed number of slots to be per-xprt or global. Today, it's
per-xprt. Making it global might mean less wasted memory overall, but I
could forsee situations where a hung mount could starve other mounts of
slots.

Figuring that out might help guide the overall design here...

> For UDP and RDMA, the default # of slot entries can turn into an upper bound.
> For TCP, the default # of slot entries would have no meaning WRT upper bound, as the upper bound is determined
> by the TCP window size.
> 

Maybe rather than having this dynamic_slot_alloc operation that's only
conditionally called, it would cleaner to abstract out the slot
allocator altogether.

UDP and RDMA could use the same routine and just allocate out of a
fixed set of preallocated slots (sort of like how they do today), and
TCP could use this new scheme.
Andy Adamson May 4, 2011, 3:20 p.m. UTC | #13
On May 4, 2011, at 11:08 AM, Jeff Layton wrote:

> On Mon,  2 May 2011 21:40:08 -0400
> andros@netapp.com wrote:
> 
>> +	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
>> +		return;
> 
> Also, I'm not sure that a single bit really conveys enough information
> for this.
> 
> IIUC, sk_write_space gets called when a packet is TCP ACK'ed. It seems
> possible that we would sometimes have buffer space available to queue
> the packet without sk_write_space being called. With this, we'll
> basically be serializing all dynamic slot allocations behind the
> sk_write_space callbacks.

Which I thought was OK given that the TCP window takes a while to stabilize.

> 
> Consider the case of many small TCP frames being sent after a large one
> just got ACK'ed. Only one would be allowed to be sent, even though
> there might be enough send buffer space to allow for more.
> 
> Would it instead make more sense to base this on the amount of space
> available in the actual socket rather than this bit?

So at each write_space, potentially allocate more than one rpc_slot as opposed
to allocating one rpc_slot and waiting for the next write_space? I could look at this
with the 10G testiing.

-->Andy

> 
> -- 
> Jeff Layton <jlayton@redhat.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust May 4, 2011, 3:30 p.m. UTC | #14
On Wed, 2011-05-04 at 11:18 -0400, Jeff Layton wrote:
> The other thing you have to determine is whether you want this
> guaranteed number of slots to be per-xprt or global. Today, it's
> per-xprt. Making it global might mean less wasted memory overall, but I
> could forsee situations where a hung mount could starve other mounts of
> slots.

The hung mount scenario is one that we're not going to deal too well
with anyway: both rpc_tasks and rpc buffers are allocated from global
mempools (as are the struct nfs_read_data/nfs_write_data)
Jeff Layton May 4, 2011, 3:31 p.m. UTC | #15
On Wed, 4 May 2011 11:20:59 -0400
Andy Adamson <andros@netapp.com> wrote:

> 
> On May 4, 2011, at 11:08 AM, Jeff Layton wrote:
> 
> > On Mon,  2 May 2011 21:40:08 -0400
> > andros@netapp.com wrote:
> > 
> >> +	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
> >> +		return;
> > 
> > Also, I'm not sure that a single bit really conveys enough information
> > for this.
> > 
> > IIUC, sk_write_space gets called when a packet is TCP ACK'ed. It seems
> > possible that we would sometimes have buffer space available to queue
> > the packet without sk_write_space being called. With this, we'll
> > basically be serializing all dynamic slot allocations behind the
> > sk_write_space callbacks.
> 
> Which I thought was OK given that the TCP window takes a while to stabilize.
> 

Hmm, ok. I hadn't considered that, but you may be correct.

> > 
> > Consider the case of many small TCP frames being sent after a large one
> > just got ACK'ed. Only one would be allowed to be sent, even though
> > there might be enough send buffer space to allow for more.
> > 
> > Would it instead make more sense to base this on the amount of space
> > available in the actual socket rather than this bit?
> 
> So at each write_space, potentially allocate more than one rpc_slot as opposed
> to allocating one rpc_slot and waiting for the next write_space? I could look at this
> with the 10G testiing.
> 

xs_tcp_write_space has this:

        /* from net/core/stream.c:sk_stream_write_space */
        if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                xs_write_space(sk);

...my (hand-wavy) thinking was that it might make sense to check
against this value again in the slot allocation routine.

IOW, allow allocation of a slot if there's reasonably enough buffer
space to do a send instead of allowing just one per xs_write_space
callback.

Of course, that gets complicated if there are a bunch of slot
allocations in parallel. You'd probably need to offset the amount of
available space by a certain amount for each slot that has been
allocated but not yet sent.

Perhaps though that's too complicated to be really useful...
Trond Myklebust May 4, 2011, 3:35 p.m. UTC | #16
On Wed, 2011-05-04 at 11:20 -0400, Andy Adamson wrote:
> On May 4, 2011, at 11:08 AM, Jeff Layton wrote:
> 
> > On Mon,  2 May 2011 21:40:08 -0400
> > andros@netapp.com wrote:
> > 
> >> +	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
> >> +		return;
> > 
> > Also, I'm not sure that a single bit really conveys enough information
> > for this.
> > 
> > IIUC, sk_write_space gets called when a packet is TCP ACK'ed. It seems
> > possible that we would sometimes have buffer space available to queue
> > the packet without sk_write_space being called. With this, we'll
> > basically be serializing all dynamic slot allocations behind the
> > sk_write_space callbacks.
> 
> Which I thought was OK given that the TCP window takes a while to stabilize.
> 
> > 
> > Consider the case of many small TCP frames being sent after a large one
> > just got ACK'ed. Only one would be allowed to be sent, even though
> > there might be enough send buffer space to allow for more.
> > 
> > Would it instead make more sense to base this on the amount of space
> > available in the actual socket rather than this bit?
> 
> So at each write_space, potentially allocate more than one rpc_slot as opposed
> to allocating one rpc_slot and waiting for the next write_space? I could look at this
> with the 10G testiing.

Why? You can't send that data. Once you hit the write space limit, then
the socket remains blocked until you get the callback. It doesn't matter
how small the frame, you will not be allowed to send more data.

On the other hand, we do set the SOCK_NOSPACE bit, which means that the
socket layer will attempt to grow the TCP window even though we're not
actually putting more data into the socket.
Andy Adamson May 4, 2011, 3:52 p.m. UTC | #17
On May 4, 2011, at 11:18 AM, Jeff Layton wrote:

> On Wed, 4 May 2011 10:54:57 -0400
> Andy Adamson <andros@netapp.com> wrote:
> 
>> 
>> On May 4, 2011, at 7:54 AM, Jeff Layton wrote:
>> 
>>> On Wed, 4 May 2011 12:07:26 +1000
>>> NeilBrown <neilb@suse.de> wrote:
>>> 
>>>> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
>>>> <Trond.Myklebust@netapp.com> wrote:
>>>> 
>>>>> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
>>>>>> For rpc slots, I doubt you need mempools at all.
>>>>>> Mempools are only needed if failure is not an option and you would rather
>>>>>> wait, but you cannot wait for regular writeback because you are on the
>>>>>> writeback path.  So you use a mempool which waits for a previous request to
>>>>>> complete.  I don't think that describes rpc slots at all.
>>>>>> For rpc slots, you can afford to wait when setting up the transport, as you
>>>>>> are not on the writeout path yet, and later you can always cope with failure.
>>>>>> So just use kmalloc.
>>>>> 
>>>>> Errr.... No. By the time you get to allocating an RPC slot, you may be
>>>>> bang smack in the middle of the writeout path.
>>>>> 
>>>>> The scenario would be that something triggers a writeback (kswapd,
>>>>> direct reclaim,...) which triggers an RPC call, which requires you to
>>>>> allocate at least one rpc slot before you can put the write on the wire.
>>>>> 
>>>>> I agree with your assertion that we only need one successful slot
>>>>> allocation in order to make overall forward progress, but we definitely
>>>>> do need that one...
>>>>> 
>>>> 
>>>> Probably I misunderstood the code, but I thought that there was a base set of
>>>> slots preallocated.  Eventually one of those will become free won't it?
>>>> 
>>>> Looking at the code again, it only ever returns entries to the mempool when
>>>> it is about to destroy the xprt.  That makes no sense.  If you are using a
>>>> mempool, then you allocate when you need to use space, and free it as soon as
>>>> you have finished with it, so the next request can get a turn.
>>>> 
>>> 
>>> That was sort of my point, though I didn't articulate it very well.
>>> 
>>> I don't think we want to keep this "static" set of rqsts and the
>>> mempool. It seems like that's somewhat redundant and more complex than
>>> is warranted. The question is...which should we keep?
>>> 
>>> I think it sort of depends on what the intended model is. If we want to
>>> allocate out of the "guaranteed" pool first and only when that's
>>> exhausted fall back to doing dynamic allocations, then we probably want
>>> to keep the static list of entries.
>>> 
>>> If we want to allocate dynamically most of the time and only dip into
>>> the "guaranteed" pool when that can't be satisfied, then basing all of
>>> this around mempools might make more sense.
>> 
>> Thanks for the review - I've learned a lot about mempools from this discussion.
>> 
>> I've noticed that a lot of the time, (at least for TCP) rpc_slots are allocated and not used.
>> The default slot entries value is a guess. This is why I like the idea of dynamically allocating most 
>> of the time, and only dipping into the guaranteed pool when the dynamic allocation
>> can't be satisfied. e.g. get rid of the static set of slots.
>> Does this design require two mempools, one for dynamic allocation and a guaranteed one being small and "in reserve"?
>> 
> 
> No, a single mempool would give you that. Just declare the mempool with
> the amount of "emergency" slots you want, and then allocate directly
> from it and then free them back to the mempool. Most of the time (as
> Neil points out) you'll get them allocated from the slabcache. It'll
> it'll only dip into the reserved set when those can't be allocated.

Cool.

> 
> The other thing you have to determine is whether you want this
> guaranteed number of slots to be per-xprt or global. Today, it's
> per-xprt. Making it global might mean less wasted memory overall, but I
> could forsee situations where a hung mount could starve other mounts of
> slots.
> 
> Figuring that out might help guide the overall design here...
> 
>> For UDP and RDMA, the default # of slot entries can turn into an upper bound.
>> For TCP, the default # of slot entries would have no meaning WRT upper bound, as the upper bound is determined
>> by the TCP window size.
>> 
> 
> Maybe rather than having this dynamic_slot_alloc operation that's only
> conditionally called, it would cleaner to abstract out the slot
> allocator altogether.
> 
> UDP and RDMA could use the same routine and just allocate out of a
> fixed set of preallocated slots (sort of like how they do today), and
> TCP could use this new scheme.

We need rpc_slots (UDP/RDMA/TCP/....) to be freed back to the mempool in xprt_free_slot.
If we allocate the set # of rpc_slots, place them on the xprt->free list, and they don't get used,
then they won't be freed until the xprt is destroyed.

I was thinking of using the same allocator for UDP/RDMA/TCP. With all transports, start off with a 
minimal # of pre-allocated slots (6? 4? 2?) and for UDP/RDMA trigger a dynamic allocation in xprt_alloc_slot
when there are no slots available and the # slots allocated is less than the default # slot entries for that transport.

-->Andy

> -- 
> Jeff Layton <jlayton@redhat.com>

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever May 4, 2011, 4:01 p.m. UTC | #18
On May 4, 2011, at 11:52 AM, Andy Adamson wrote:

> 
> On May 4, 2011, at 11:18 AM, Jeff Layton wrote:
> 
>> On Wed, 4 May 2011 10:54:57 -0400
>> Andy Adamson <andros@netapp.com> wrote:
>> 
>>> 
>>> On May 4, 2011, at 7:54 AM, Jeff Layton wrote:
>>> 
>>>> On Wed, 4 May 2011 12:07:26 +1000
>>>> NeilBrown <neilb@suse.de> wrote:
>>>> 
>>>>> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
>>>>> <Trond.Myklebust@netapp.com> wrote:
>>>>> 
>>>>>> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
>>>>>>> For rpc slots, I doubt you need mempools at all.
>>>>>>> Mempools are only needed if failure is not an option and you would rather
>>>>>>> wait, but you cannot wait for regular writeback because you are on the
>>>>>>> writeback path.  So you use a mempool which waits for a previous request to
>>>>>>> complete.  I don't think that describes rpc slots at all.
>>>>>>> For rpc slots, you can afford to wait when setting up the transport, as you
>>>>>>> are not on the writeout path yet, and later you can always cope with failure.
>>>>>>> So just use kmalloc.
>>>>>> 
>>>>>> Errr.... No. By the time you get to allocating an RPC slot, you may be
>>>>>> bang smack in the middle of the writeout path.
>>>>>> 
>>>>>> The scenario would be that something triggers a writeback (kswapd,
>>>>>> direct reclaim,...) which triggers an RPC call, which requires you to
>>>>>> allocate at least one rpc slot before you can put the write on the wire.
>>>>>> 
>>>>>> I agree with your assertion that we only need one successful slot
>>>>>> allocation in order to make overall forward progress, but we definitely
>>>>>> do need that one...
>>>>>> 
>>>>> 
>>>>> Probably I misunderstood the code, but I thought that there was a base set of
>>>>> slots preallocated.  Eventually one of those will become free won't it?
>>>>> 
>>>>> Looking at the code again, it only ever returns entries to the mempool when
>>>>> it is about to destroy the xprt.  That makes no sense.  If you are using a
>>>>> mempool, then you allocate when you need to use space, and free it as soon as
>>>>> you have finished with it, so the next request can get a turn.
>>>>> 
>>>> 
>>>> That was sort of my point, though I didn't articulate it very well.
>>>> 
>>>> I don't think we want to keep this "static" set of rqsts and the
>>>> mempool. It seems like that's somewhat redundant and more complex than
>>>> is warranted. The question is...which should we keep?
>>>> 
>>>> I think it sort of depends on what the intended model is. If we want to
>>>> allocate out of the "guaranteed" pool first and only when that's
>>>> exhausted fall back to doing dynamic allocations, then we probably want
>>>> to keep the static list of entries.
>>>> 
>>>> If we want to allocate dynamically most of the time and only dip into
>>>> the "guaranteed" pool when that can't be satisfied, then basing all of
>>>> this around mempools might make more sense.
>>> 
>>> Thanks for the review - I've learned a lot about mempools from this discussion.
>>> 
>>> I've noticed that a lot of the time, (at least for TCP) rpc_slots are allocated and not used.
>>> The default slot entries value is a guess. This is why I like the idea of dynamically allocating most 
>>> of the time, and only dipping into the guaranteed pool when the dynamic allocation
>>> can't be satisfied. e.g. get rid of the static set of slots.
>>> Does this design require two mempools, one for dynamic allocation and a guaranteed one being small and "in reserve"?
>>> 
>> 
>> No, a single mempool would give you that. Just declare the mempool with
>> the amount of "emergency" slots you want, and then allocate directly
>> from it and then free them back to the mempool. Most of the time (as
>> Neil points out) you'll get them allocated from the slabcache. It'll
>> it'll only dip into the reserved set when those can't be allocated.
> 
> Cool.
> 
>> 
>> The other thing you have to determine is whether you want this
>> guaranteed number of slots to be per-xprt or global. Today, it's
>> per-xprt. Making it global might mean less wasted memory overall, but I
>> could forsee situations where a hung mount could starve other mounts of
>> slots.
>> 
>> Figuring that out might help guide the overall design here...
>> 
>>> For UDP and RDMA, the default # of slot entries can turn into an upper bound.
>>> For TCP, the default # of slot entries would have no meaning WRT upper bound, as the upper bound is determined
>>> by the TCP window size.
>>> 
>> 
>> Maybe rather than having this dynamic_slot_alloc operation that's only
>> conditionally called, it would cleaner to abstract out the slot
>> allocator altogether.
>> 
>> UDP and RDMA could use the same routine and just allocate out of a
>> fixed set of preallocated slots (sort of like how they do today), and
>> TCP could use this new scheme.
> 
> We need rpc_slots (UDP/RDMA/TCP/....) to be freed back to the mempool in xprt_free_slot.
> If we allocate the set # of rpc_slots, place them on the xprt->free list, and they don't get used,
> then they won't be freed until the xprt is destroyed.

The reason to keep slots around and on a local free list is that full-out memory allocation is heavy weight.  I keep thinking of the talk Willy gave at LSF about how much scalability was affected in the SCSI I/O path by simple memory allocation operations.  Is using a SLAB or SLUB as fast as what we have today?

> I was thinking of using the same allocator for UDP/RDMA/TCP. With all transports, start off with a 
> minimal # of pre-allocated slots (6? 4? 2?) and for UDP/RDMA trigger a dynamic allocation in xprt_alloc_slot
> when there are no slots available and the # slots allocated is less than the default # slot entries for that transport.
> 
> -->Andy
> 
>> -- 
>> Jeff Layton <jlayton@redhat.com>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Adamson May 4, 2011, 5:22 p.m. UTC | #19
On May 4, 2011, at 12:01 PM, Chuck Lever wrote:

> 
> On May 4, 2011, at 11:52 AM, Andy Adamson wrote:
> 
>> 
>> On May 4, 2011, at 11:18 AM, Jeff Layton wrote:
>> 
>>> On Wed, 4 May 2011 10:54:57 -0400
>>> Andy Adamson <andros@netapp.com> wrote:
>>> 
>>>> 
>>>> On May 4, 2011, at 7:54 AM, Jeff Layton wrote:
>>>> 
>>>>> On Wed, 4 May 2011 12:07:26 +1000
>>>>> NeilBrown <neilb@suse.de> wrote:
>>>>> 
>>>>>> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
>>>>>> <Trond.Myklebust@netapp.com> wrote:
>>>>>> 
>>>>>>> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
>>>>>>>> For rpc slots, I doubt you need mempools at all.
>>>>>>>> Mempools are only needed if failure is not an option and you would rather
>>>>>>>> wait, but you cannot wait for regular writeback because you are on the
>>>>>>>> writeback path.  So you use a mempool which waits for a previous request to
>>>>>>>> complete.  I don't think that describes rpc slots at all.
>>>>>>>> For rpc slots, you can afford to wait when setting up the transport, as you
>>>>>>>> are not on the writeout path yet, and later you can always cope with failure.
>>>>>>>> So just use kmalloc.
>>>>>>> 
>>>>>>> Errr.... No. By the time you get to allocating an RPC slot, you may be
>>>>>>> bang smack in the middle of the writeout path.
>>>>>>> 
>>>>>>> The scenario would be that something triggers a writeback (kswapd,
>>>>>>> direct reclaim,...) which triggers an RPC call, which requires you to
>>>>>>> allocate at least one rpc slot before you can put the write on the wire.
>>>>>>> 
>>>>>>> I agree with your assertion that we only need one successful slot
>>>>>>> allocation in order to make overall forward progress, but we definitely
>>>>>>> do need that one...
>>>>>>> 
>>>>>> 
>>>>>> Probably I misunderstood the code, but I thought that there was a base set of
>>>>>> slots preallocated.  Eventually one of those will become free won't it?
>>>>>> 
>>>>>> Looking at the code again, it only ever returns entries to the mempool when
>>>>>> it is about to destroy the xprt.  That makes no sense.  If you are using a
>>>>>> mempool, then you allocate when you need to use space, and free it as soon as
>>>>>> you have finished with it, so the next request can get a turn.
>>>>>> 
>>>>> 
>>>>> That was sort of my point, though I didn't articulate it very well.
>>>>> 
>>>>> I don't think we want to keep this "static" set of rqsts and the
>>>>> mempool. It seems like that's somewhat redundant and more complex than
>>>>> is warranted. The question is...which should we keep?
>>>>> 
>>>>> I think it sort of depends on what the intended model is. If we want to
>>>>> allocate out of the "guaranteed" pool first and only when that's
>>>>> exhausted fall back to doing dynamic allocations, then we probably want
>>>>> to keep the static list of entries.
>>>>> 
>>>>> If we want to allocate dynamically most of the time and only dip into
>>>>> the "guaranteed" pool when that can't be satisfied, then basing all of
>>>>> this around mempools might make more sense.
>>>> 
>>>> Thanks for the review - I've learned a lot about mempools from this discussion.
>>>> 
>>>> I've noticed that a lot of the time, (at least for TCP) rpc_slots are allocated and not used.
>>>> The default slot entries value is a guess. This is why I like the idea of dynamically allocating most 
>>>> of the time, and only dipping into the guaranteed pool when the dynamic allocation
>>>> can't be satisfied. e.g. get rid of the static set of slots.
>>>> Does this design require two mempools, one for dynamic allocation and a guaranteed one being small and "in reserve"?
>>>> 
>>> 
>>> No, a single mempool would give you that. Just declare the mempool with
>>> the amount of "emergency" slots you want, and then allocate directly
>>> from it and then free them back to the mempool. Most of the time (as
>>> Neil points out) you'll get them allocated from the slabcache. It'll
>>> it'll only dip into the reserved set when those can't be allocated.
>> 
>> Cool.
>> 
>>> 
>>> The other thing you have to determine is whether you want this
>>> guaranteed number of slots to be per-xprt or global. Today, it's
>>> per-xprt. Making it global might mean less wasted memory overall, but I
>>> could forsee situations where a hung mount could starve other mounts of
>>> slots.
>>> 
>>> Figuring that out might help guide the overall design here...
>>> 
>>>> For UDP and RDMA, the default # of slot entries can turn into an upper bound.
>>>> For TCP, the default # of slot entries would have no meaning WRT upper bound, as the upper bound is determined
>>>> by the TCP window size.
>>>> 
>>> 
>>> Maybe rather than having this dynamic_slot_alloc operation that's only
>>> conditionally called, it would cleaner to abstract out the slot
>>> allocator altogether.
>>> 
>>> UDP and RDMA could use the same routine and just allocate out of a
>>> fixed set of preallocated slots (sort of like how they do today), and
>>> TCP could use this new scheme.
>> 
>> We need rpc_slots (UDP/RDMA/TCP/....) to be freed back to the mempool in xprt_free_slot.
>> If we allocate the set # of rpc_slots, place them on the xprt->free list, and they don't get used,
>> then they won't be freed until the xprt is destroyed.
> 
> The reason to keep slots around and on a local free list is that full-out memory allocation is heavy weight.  I keep thinking of the talk Willy gave at LSF about how much scalability was affected in the SCSI I/O path by simple memory allocation operations.  Is using a SLAB or SLUB as fast as what we have today?

That is a good question. We could implement this same slot allocation scheme without a SLAB/SLUB - just use kmalloc and leave xprt_free_slot to just move the allocated slots to the free list as is done today. I don't have any information on a performance hit. 

Is this worth testing, or should we just stick with kmalloc and friends?

-->Andy
> 
>> I was thinking of using the same allocator for UDP/RDMA/TCP. With all transports, start off with a 
>> minimal # of pre-allocated slots (6? 4? 2?) and for UDP/RDMA trigger a dynamic allocation in xprt_alloc_slot
>> when there are no slots available and the # slots allocated is less than the default # slot entries for that transport.
>> 
>> -->Andy
>> 
>>> -- 
>>> Jeff Layton <jlayton@redhat.com>
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> -- 
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
> 
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton May 5, 2011, 11:47 a.m. UTC | #20
On Wed, 04 May 2011 11:35:34 -0400
Trond Myklebust <Trond.Myklebust@netapp.com> wrote:

> On Wed, 2011-05-04 at 11:20 -0400, Andy Adamson wrote:
> > On May 4, 2011, at 11:08 AM, Jeff Layton wrote:
> > 
> > > On Mon,  2 May 2011 21:40:08 -0400
> > > andros@netapp.com wrote:
> > > 
> > >> +	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
> > >> +		return;
> > > 
> > > Also, I'm not sure that a single bit really conveys enough information
> > > for this.
> > > 
> > > IIUC, sk_write_space gets called when a packet is TCP ACK'ed. It seems
> > > possible that we would sometimes have buffer space available to queue
> > > the packet without sk_write_space being called. With this, we'll
> > > basically be serializing all dynamic slot allocations behind the
> > > sk_write_space callbacks.
> > 
> > Which I thought was OK given that the TCP window takes a while to stabilize.
> > 
> > > 
> > > Consider the case of many small TCP frames being sent after a large one
> > > just got ACK'ed. Only one would be allowed to be sent, even though
> > > there might be enough send buffer space to allow for more.
> > > 
> > > Would it instead make more sense to base this on the amount of space
> > > available in the actual socket rather than this bit?
> > 
> > So at each write_space, potentially allocate more than one rpc_slot as opposed
> > to allocating one rpc_slot and waiting for the next write_space? I could look at this
> > with the 10G testiing.
> 
> Why? You can't send that data. Once you hit the write space limit, then
> the socket remains blocked until you get the callback. It doesn't matter
> how small the frame, you will not be allowed to send more data.
> 
> On the other hand, we do set the SOCK_NOSPACE bit, which means that the
> socket layer will attempt to grow the TCP window even though we're not
> actually putting more data into the socket.
> 

I'm not sure I understand what you're suggesting here.

I guess my main point is that a single bit that we flip on in
write_space and flip off when a slot is allocated doesn't carry enough
info. That scheme will also be subject to subtle differences in timing.
For instance...

Suppose a large number of TCP ACKs come in all at around the same time.
write_space gets called a bunch of times in succession, so the bit gets
"set" several times. Several queued tasks get woken up but only one can
clear the bit so only one gets a slot.

However, if those acks come in with enough of a delay between them, then
you can potentially get one slot allocated per write_space callback.

I think we ought to consider a heuristic that doesn't rely on the
frequency and timing of write_space callbacks.
Jeff Layton May 5, 2011, 12:05 p.m. UTC | #21
On Wed, 4 May 2011 13:22:01 -0400
Andy Adamson <andros@netapp.com> wrote:

> 
> On May 4, 2011, at 12:01 PM, Chuck Lever wrote:
> 
> > 
> > On May 4, 2011, at 11:52 AM, Andy Adamson wrote:
> > 
> >> 
> >> On May 4, 2011, at 11:18 AM, Jeff Layton wrote:
> >> 
> >>> On Wed, 4 May 2011 10:54:57 -0400
> >>> Andy Adamson <andros@netapp.com> wrote:
> >>> 
> >>>> 
> >>>> On May 4, 2011, at 7:54 AM, Jeff Layton wrote:
> >>>> 
> >>>>> On Wed, 4 May 2011 12:07:26 +1000
> >>>>> NeilBrown <neilb@suse.de> wrote:
> >>>>> 
> >>>>>> On Tue, 03 May 2011 21:46:03 -0400 Trond Myklebust
> >>>>>> <Trond.Myklebust@netapp.com> wrote:
> >>>>>> 
> >>>>>>> On Wed, 2011-05-04 at 11:18 +1000, NeilBrown wrote:
> >>>>>>>> For rpc slots, I doubt you need mempools at all.
> >>>>>>>> Mempools are only needed if failure is not an option and you would rather
> >>>>>>>> wait, but you cannot wait for regular writeback because you are on the
> >>>>>>>> writeback path.  So you use a mempool which waits for a previous request to
> >>>>>>>> complete.  I don't think that describes rpc slots at all.
> >>>>>>>> For rpc slots, you can afford to wait when setting up the transport, as you
> >>>>>>>> are not on the writeout path yet, and later you can always cope with failure.
> >>>>>>>> So just use kmalloc.
> >>>>>>> 
> >>>>>>> Errr.... No. By the time you get to allocating an RPC slot, you may be
> >>>>>>> bang smack in the middle of the writeout path.
> >>>>>>> 
> >>>>>>> The scenario would be that something triggers a writeback (kswapd,
> >>>>>>> direct reclaim,...) which triggers an RPC call, which requires you to
> >>>>>>> allocate at least one rpc slot before you can put the write on the wire.
> >>>>>>> 
> >>>>>>> I agree with your assertion that we only need one successful slot
> >>>>>>> allocation in order to make overall forward progress, but we definitely
> >>>>>>> do need that one...
> >>>>>>> 
> >>>>>> 
> >>>>>> Probably I misunderstood the code, but I thought that there was a base set of
> >>>>>> slots preallocated.  Eventually one of those will become free won't it?
> >>>>>> 
> >>>>>> Looking at the code again, it only ever returns entries to the mempool when
> >>>>>> it is about to destroy the xprt.  That makes no sense.  If you are using a
> >>>>>> mempool, then you allocate when you need to use space, and free it as soon as
> >>>>>> you have finished with it, so the next request can get a turn.
> >>>>>> 
> >>>>> 
> >>>>> That was sort of my point, though I didn't articulate it very well.
> >>>>> 
> >>>>> I don't think we want to keep this "static" set of rqsts and the
> >>>>> mempool. It seems like that's somewhat redundant and more complex than
> >>>>> is warranted. The question is...which should we keep?
> >>>>> 
> >>>>> I think it sort of depends on what the intended model is. If we want to
> >>>>> allocate out of the "guaranteed" pool first and only when that's
> >>>>> exhausted fall back to doing dynamic allocations, then we probably want
> >>>>> to keep the static list of entries.
> >>>>> 
> >>>>> If we want to allocate dynamically most of the time and only dip into
> >>>>> the "guaranteed" pool when that can't be satisfied, then basing all of
> >>>>> this around mempools might make more sense.
> >>>> 
> >>>> Thanks for the review - I've learned a lot about mempools from this discussion.
> >>>> 
> >>>> I've noticed that a lot of the time, (at least for TCP) rpc_slots are allocated and not used.
> >>>> The default slot entries value is a guess. This is why I like the idea of dynamically allocating most 
> >>>> of the time, and only dipping into the guaranteed pool when the dynamic allocation
> >>>> can't be satisfied. e.g. get rid of the static set of slots.
> >>>> Does this design require two mempools, one for dynamic allocation and a guaranteed one being small and "in reserve"?
> >>>> 
> >>> 
> >>> No, a single mempool would give you that. Just declare the mempool with
> >>> the amount of "emergency" slots you want, and then allocate directly
> >>> from it and then free them back to the mempool. Most of the time (as
> >>> Neil points out) you'll get them allocated from the slabcache. It'll
> >>> it'll only dip into the reserved set when those can't be allocated.
> >> 
> >> Cool.
> >> 
> >>> 
> >>> The other thing you have to determine is whether you want this
> >>> guaranteed number of slots to be per-xprt or global. Today, it's
> >>> per-xprt. Making it global might mean less wasted memory overall, but I
> >>> could forsee situations where a hung mount could starve other mounts of
> >>> slots.
> >>> 
> >>> Figuring that out might help guide the overall design here...
> >>> 
> >>>> For UDP and RDMA, the default # of slot entries can turn into an upper bound.
> >>>> For TCP, the default # of slot entries would have no meaning WRT upper bound, as the upper bound is determined
> >>>> by the TCP window size.
> >>>> 
> >>> 
> >>> Maybe rather than having this dynamic_slot_alloc operation that's only
> >>> conditionally called, it would cleaner to abstract out the slot
> >>> allocator altogether.
> >>> 
> >>> UDP and RDMA could use the same routine and just allocate out of a
> >>> fixed set of preallocated slots (sort of like how they do today), and
> >>> TCP could use this new scheme.
> >> 
> >> We need rpc_slots (UDP/RDMA/TCP/....) to be freed back to the mempool in xprt_free_slot.
> >> If we allocate the set # of rpc_slots, place them on the xprt->free list, and they don't get used,
> >> then they won't be freed until the xprt is destroyed.
> > 
> > The reason to keep slots around and on a local free list is that full-out memory allocation is heavy weight.  I keep thinking of the talk Willy gave at LSF about how much scalability was affected in the SCSI I/O path by simple memory allocation operations.  Is using a SLAB or SLUB as fast as what we have today?
> 
> That is a good question. We could implement this same slot allocation scheme without a SLAB/SLUB - just use kmalloc and leave xprt_free_slot to just move the allocated slots to the free list as is done today. I don't have any information on a performance hit. 
> 
> Is this worth testing, or should we just stick with kmalloc and friends?
> 

I think Chuck's point here is that there is an argument for doing it
the other way around -- keeping a static set of objects and only
dynamically allocating when those are used up.

Of course, pulling them off the free list is not cost-free either (you
do need to take a spinlock there), but it may have less contention.

It's hard for me to imagine that this will make a big difference
overall, but I guess you never know. Personally, I wouldn't worry about
it. :)
Trond Myklebust May 5, 2011, 12:19 p.m. UTC | #22
On Thu, 2011-05-05 at 07:47 -0400, Jeff Layton wrote:
> On Wed, 04 May 2011 11:35:34 -0400
> Trond Myklebust <Trond.Myklebust@netapp.com> wrote:
> 
> > On Wed, 2011-05-04 at 11:20 -0400, Andy Adamson wrote:
> > > On May 4, 2011, at 11:08 AM, Jeff Layton wrote:
> > > 
> > > > On Mon,  2 May 2011 21:40:08 -0400
> > > > andros@netapp.com wrote:
> > > > 
> > > >> +	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
> > > >> +		return;
> > > > 
> > > > Also, I'm not sure that a single bit really conveys enough information
> > > > for this.
> > > > 
> > > > IIUC, sk_write_space gets called when a packet is TCP ACK'ed. It seems
> > > > possible that we would sometimes have buffer space available to queue
> > > > the packet without sk_write_space being called. With this, we'll
> > > > basically be serializing all dynamic slot allocations behind the
> > > > sk_write_space callbacks.
> > > 
> > > Which I thought was OK given that the TCP window takes a while to stabilize.
> > > 
> > > > 
> > > > Consider the case of many small TCP frames being sent after a large one
> > > > just got ACK'ed. Only one would be allowed to be sent, even though
> > > > there might be enough send buffer space to allow for more.
> > > > 
> > > > Would it instead make more sense to base this on the amount of space
> > > > available in the actual socket rather than this bit?
> > > 
> > > So at each write_space, potentially allocate more than one rpc_slot as opposed
> > > to allocating one rpc_slot and waiting for the next write_space? I could look at this
> > > with the 10G testiing.
> > 
> > Why? You can't send that data. Once you hit the write space limit, then
> > the socket remains blocked until you get the callback. It doesn't matter
> > how small the frame, you will not be allowed to send more data.
> > 
> > On the other hand, we do set the SOCK_NOSPACE bit, which means that the
> > socket layer will attempt to grow the TCP window even though we're not
> > actually putting more data into the socket.
> > 
> 
> I'm not sure I understand what you're suggesting here.
> 
> I guess my main point is that a single bit that we flip on in
> write_space and flip off when a slot is allocated doesn't carry enough
> info. That scheme will also be subject to subtle differences in timing.
> For instance...
> 
> Suppose a large number of TCP ACKs come in all at around the same time.
> write_space gets called a bunch of times in succession, so the bit gets
> "set" several times. Several queued tasks get woken up but only one can
> clear the bit so only one gets a slot.
> 
> However, if those acks come in with enough of a delay between them, then
> you can potentially get one slot allocated per write_space callback.

The write space callback doesn't wake anyone up until 1/2 the total
socket buffer is free: that's what the sock_writeable() test does.

> I think we ought to consider a heuristic that doesn't rely on the
> frequency and timing of write_space callbacks.

What we're doing now is basically what is being done by the socket layer
when a user process tries to write too much data to the socket: we tell
the TCP layer to grow the window, and we wait for the write space
callback to tell us that we have enough free socket buffer space to be
able to make progress. We're not waking up and retrying on every ACK as
you suggest.
diff mbox

Patch

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d81db80..3202d09 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -242,6 +242,8 @@  int		rpc_init_mempool(void);
 void		rpc_destroy_mempool(void);
 extern struct workqueue_struct *rpciod_workqueue;
 void		rpc_prepare_task(struct rpc_task *task);
+void		rpc_free_slot(struct rpc_rqst *req);
+struct rpc_rqst *rpc_alloc_slot(gfp_t gfp);
 
 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
 {
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a0f998c..ae3682c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -118,6 +118,7 @@  struct rpc_xprt_ops {
 	void		(*connect)(struct rpc_task *task);
 	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
 	void		(*buf_free)(void *buffer);
+	void		(*dynamic_slot_alloc)(struct rpc_xprt *xprt);
 	int		(*send_request)(struct rpc_task *task);
 	void		(*set_retrans_timeout)(struct rpc_task *task);
 	void		(*timer)(struct rpc_task *task);
@@ -167,7 +168,6 @@  struct rpc_xprt {
 	struct rpc_wait_queue	pending;	/* requests in flight */
 	struct rpc_wait_queue	backlog;	/* waiting for slot */
 	struct list_head	free;		/* free slots */
-	struct rpc_rqst *	slot;		/* slot table storage */
 	unsigned int		max_reqs;	/* total slots */
 	unsigned long		state;		/* transport state */
 	unsigned char		shutdown   : 1,	/* being shut down */
@@ -283,6 +283,9 @@  struct rpc_xprt *	xprt_get(struct rpc_xprt *xprt);
 void			xprt_put(struct rpc_xprt *xprt);
 struct rpc_xprt *	xprt_alloc(struct net *net, int size, int max_req);
 void			xprt_free(struct rpc_xprt *);
+int			xprt_alloc_slot_entries(struct rpc_xprt *xprt,
+						int num_req);
+void			 xprt_add_slot(struct rpc_xprt *xprt);
 
 static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *p)
 {
@@ -321,6 +324,7 @@  void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
 #define XPRT_CONNECTION_ABORT	(7)
 #define XPRT_CONNECTION_CLOSE	(8)
 #define XPRT_INITIALIZED	(9)
+#define XPRT_WRITE_SPACE	(10)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e7a96e4..8e21d27 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1276,6 +1276,10 @@  call_transmit(struct rpc_task *task)
 	task->tk_action = call_status;
 	if (task->tk_status < 0)
 		return;
+
+	if (task->tk_xprt->ops->dynamic_slot_alloc)
+		task->tk_xprt->ops->dynamic_slot_alloc(task->tk_xprt);
+
 	task->tk_status = xprt_prepare_transmit(task);
 	if (task->tk_status != 0)
 		return;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 6b43ee7..bbd4018 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -33,10 +33,13 @@ 
 #define RPC_BUFFER_MAXSIZE	(2048)
 #define RPC_BUFFER_POOLSIZE	(8)
 #define RPC_TASK_POOLSIZE	(8)
+#define RPC_SLOT_POOLSIZE	(RPC_TASK_POOLSIZE * RPC_DEF_SLOT_TABLE)
 static struct kmem_cache	*rpc_task_slabp __read_mostly;
 static struct kmem_cache	*rpc_buffer_slabp __read_mostly;
+static struct kmem_cache	*rpc_slot_slabp __read_mostly;
 static mempool_t	*rpc_task_mempool __read_mostly;
 static mempool_t	*rpc_buffer_mempool __read_mostly;
+static mempool_t	*rpc_slot_mempool __read_mostly;
 
 static void			rpc_async_schedule(struct work_struct *);
 static void			 rpc_release_task(struct rpc_task *task);
@@ -961,9 +964,33 @@  static void rpciod_stop(void)
 }
 
 void
+rpc_free_slot(struct rpc_rqst *req)
+{
+	return mempool_free(req, rpc_slot_mempool);
+}
+
+/**
+ * rpc_alloc_slot - rpc_slot allocator
+ *
+ * Static rpc_xprt Initialization:
+ *   Called with GFP_KERNEL
+ *
+ * Dynamic allocation:
+ *   Called with GFP_NOWAIT
+ *   Triggered by write_space callback.
+ */
+struct rpc_rqst *
+rpc_alloc_slot(gfp_t gfp)
+{
+	return (struct rpc_rqst *)mempool_alloc(rpc_slot_mempool, gfp);
+}
+
+void
 rpc_destroy_mempool(void)
 {
 	rpciod_stop();
+	if (rpc_slot_mempool)
+		mempool_destroy(rpc_slot_mempool);
 	if (rpc_buffer_mempool)
 		mempool_destroy(rpc_buffer_mempool);
 	if (rpc_task_mempool)
@@ -972,6 +999,8 @@  rpc_destroy_mempool(void)
 		kmem_cache_destroy(rpc_task_slabp);
 	if (rpc_buffer_slabp)
 		kmem_cache_destroy(rpc_buffer_slabp);
+	if (rpc_slot_slabp)
+		kmem_cache_destroy(rpc_slot_slabp);
 	rpc_destroy_wait_queue(&delay_queue);
 }
 
@@ -998,6 +1027,12 @@  rpc_init_mempool(void)
 					     NULL);
 	if (!rpc_buffer_slabp)
 		goto err_nomem;
+	rpc_slot_slabp = kmem_cache_create("rpc_slots",
+					     sizeof(struct rpc_rqst),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (!rpc_slot_slabp)
+		goto err_nomem;
 	rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE,
 						    rpc_task_slabp);
 	if (!rpc_task_mempool)
@@ -1006,6 +1041,10 @@  rpc_init_mempool(void)
 						      rpc_buffer_slabp);
 	if (!rpc_buffer_mempool)
 		goto err_nomem;
+	rpc_slot_mempool = mempool_create_slab_pool(RPC_SLOT_POOLSIZE,
+						    rpc_slot_slabp);
+	if (!rpc_slot_mempool)
+		goto err_nomem;
 	return 0;
 err_nomem:
 	rpc_destroy_mempool();
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9494c37..1b0aa55 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -498,6 +498,7 @@  void xprt_write_space(struct rpc_xprt *xprt)
 		dprintk("RPC:       write space: waking waiting task on "
 				"xprt %p\n", xprt);
 		rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task);
+		set_bit(XPRT_WRITE_SPACE, &xprt->state);
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
@@ -957,6 +958,66 @@  static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
 	spin_unlock(&xprt->reserve_lock);
 }
 
+static void
+xprt_free_slot_entries(struct rpc_xprt *xprt)
+{
+	struct rpc_rqst	*req;
+	int i = 0;
+
+	while (!list_empty(&xprt->free)) {
+		req = list_entry(xprt->free.next, struct rpc_rqst, rq_list);
+		list_del(&req->rq_list);
+		rpc_free_slot(req);
+		i++;
+	}
+	dprintk("<-- %s mempool_free %d reqs\n", __func__, i);
+}
+
+/*
+ * Static transport rpc_slot allocation called only at rpc_xprt allocation.
+ * No need to take the xprt->reserve_lock.
+ */
+int
+xprt_alloc_slot_entries(struct rpc_xprt *xprt, int num_req)
+{
+	struct rpc_rqst *req;
+	int i;
+
+	for (i = 0; i < num_req; i++) {
+		req = rpc_alloc_slot(GFP_KERNEL);
+		if (!req)
+			return -ENOMEM;
+		memset(req, 0, sizeof(*req));
+		list_add(&req->rq_list, &xprt->free);
+	}
+	dprintk("<-- %s mempool_alloc %d reqs\n", __func__,
+		xprt->max_reqs);
+	return 0;
+}
+
+/*
+ * Dynamic rpc_slot allocator. GFP_NOWAIT will not cause rpciod to sleep.
+ * Return NULL if allocation can't be serviced immediately.
+ * Triggered by write_space callback.
+ */
+void
+xprt_add_slot(struct rpc_xprt *xprt)
+{
+	struct rpc_rqst *req;
+
+	if (!test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state))
+		return;
+	req = rpc_alloc_slot(GFP_NOWAIT);
+	if (!req)
+		return;
+	spin_lock(&xprt->reserve_lock);
+	list_add(&req->rq_list, &xprt->free);
+	xprt->max_reqs += 1;
+	spin_unlock(&xprt->reserve_lock);
+
+	dprintk("RPC	added rpc_slot to transport %p\n", xprt);
+}
+
 struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
 {
 	struct rpc_xprt *xprt;
@@ -967,14 +1028,16 @@  struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
 	atomic_set(&xprt->count, 1);
 
 	xprt->max_reqs = max_req;
-	xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
-	if (xprt->slot == NULL)
+	/* allocate slots and place on free list */
+	INIT_LIST_HEAD(&xprt->free);
+	if (xprt_alloc_slot_entries(xprt, max_req) != 0)
 		goto out_free;
 
 	xprt->xprt_net = get_net(net);
 	return xprt;
 
 out_free:
+	xprt_free_slot_entries(xprt);
 	kfree(xprt);
 out:
 	return NULL;
@@ -984,7 +1047,7 @@  EXPORT_SYMBOL_GPL(xprt_alloc);
 void xprt_free(struct rpc_xprt *xprt)
 {
 	put_net(xprt->xprt_net);
-	kfree(xprt->slot);
+	xprt_free_slot_entries(xprt);
 	kfree(xprt);
 }
 EXPORT_SYMBOL_GPL(xprt_free);
@@ -1080,7 +1143,6 @@  void xprt_release(struct rpc_task *task)
 struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
 	struct rpc_xprt	*xprt;
-	struct rpc_rqst	*req;
 	struct xprt_class *t;
 
 	spin_lock(&xprt_list_lock);
@@ -1108,7 +1170,6 @@  found:
 	spin_lock_init(&xprt->transport_lock);
 	spin_lock_init(&xprt->reserve_lock);
 
-	INIT_LIST_HEAD(&xprt->free);
 	INIT_LIST_HEAD(&xprt->recv);
 #if defined(CONFIG_NFS_V4_1)
 	spin_lock_init(&xprt->bc_pa_lock);
@@ -1131,10 +1192,6 @@  found:
 	rpc_init_wait_queue(&xprt->resend, "xprt_resend");
 	rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
 
-	/* initialize free list */
-	for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--)
-		list_add(&req->rq_list, &xprt->free);
-
 	xprt_init_xid(xprt);
 
 	dprintk("RPC:       created transport %p with %u slots\n", xprt,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf005d3..8ab2801 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2115,6 +2115,7 @@  static struct rpc_xprt_ops xs_tcp_ops = {
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
 	.buf_free		= rpc_free,
+	.dynamic_slot_alloc	= xprt_add_slot,
 	.send_request		= xs_tcp_send_request,
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
 	.close			= xs_tcp_close,